Full-stack real-time group chat with Rust/Axum backend and Riot.js frontend.

Features:
- Auth (register/login/JWT), rooms, invites, WebSocket messaging
- AI responses via OpenRouter with tool calling (Brave Search + web fetch)
- Real-time tool usage indicators (searching/reading page)
- Collapsible tool results in message bubbles
- AI stats bar (model, tokens, speed, response time) persisted to DB
- Room soft-delete, /clear command, dynamic model fetching
- Markdown rendering with code highlighting and copy buttons

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
use scraper::{Html, Selector};
use std::time::Duration;

/// Result of fetching and extracting content from a URL.
#[derive(Debug)]
pub struct FetchResult {
    pub url: String,
    pub title: Option<String>,
    pub description: Option<String>,
    pub content: String,
    /// Byte length of the extracted content before any truncation.
    pub content_length: usize,
    pub is_truncated: bool,
}

/// Tags to skip entirely during text extraction (noise elements).
const STRIP_TAGS: &[&str] = &[
    "script", "style", "noscript", "svg", "iframe", "nav", "footer", "aside",
];

/// Block-level tags that should produce newlines in text output.
const BLOCK_TAGS: &[&str] = &[
    "p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "li", "br", "tr",
    "blockquote", "pre", "section", "article", "main", "header",
    "dt", "dd", "figcaption", "table", "thead", "tbody",
];

/// Fetch a URL and extract its text content.
/// Returns a structured result with metadata and cleaned content.
pub async fn fetch_url(url: &str, max_chars: usize) -> Result<FetchResult, String> {
    // Validate URL
    if !url.starts_with("http://") && !url.starts_with("https://") {
        return Err("URL must start with http:// or https://".into());
    }

    let client = reqwest::Client::builder()
        .timeout(Duration::from_secs(30))
        .connect_timeout(Duration::from_secs(10))
        .redirect(reqwest::redirect::Policy::limited(10))
        .user_agent("GroupChat-AI/1.0 (Web Fetch Tool)")
        .gzip(true)
        .brotli(true)
        .deflate(true)
        .build()
        .map_err(|e| format!("Failed to build HTTP client: {}", e))?;

    let response = client
        .get(url)
        .send()
        .await
        .map_err(|e| format!("Failed to fetch URL: {}", e))?;

    let final_url = response.url().to_string();
    let status = response.status();

    if !status.is_success() {
        return Err(format!(
            "HTTP error {}: {}",
            status.as_u16(),
            status.canonical_reason().unwrap_or("Unknown")
        ));
    }

    // Check content type
    let content_type = response
        .headers()
        .get("content-type")
        .and_then(|v| v.to_str().ok())
        .unwrap_or("text/html")
        .to_lowercase();

    if !is_text_content(&content_type) {
        return Err(format!(
            "Cannot read non-text content type: {}",
            content_type
        ));
    }

    // Check content length (reject > 2MB)
    if let Some(len) = response.content_length() {
        if len > 2 * 1024 * 1024 {
            return Err(format!("Content too large: {} bytes (max 2MB)", len));
        }
    }

    let body = response
        .text()
        .await
        .map_err(|e| format!("Failed to read response body: {}", e))?;

    // If it's plain text or JSON, return as-is (with truncation)
    if content_type.contains("json") || content_type.starts_with("text/plain") {
        let content_length = body.len();
        let is_truncated = content_length > max_chars;
        let content = if is_truncated {
            truncate_at_boundary(&body, max_chars)
        } else {
            body
        };

        return Ok(FetchResult {
            url: final_url,
            title: None,
            description: None,
            content,
            content_length,
            is_truncated,
        });
    }

    // Parse HTML
    let document = Html::parse_document(&body);

    // Extract metadata
    let title = extract_title(&document);
    let description = extract_description(&document);

    // Extract text content
    let text = extract_text(&document);
    let content_length = text.len();
    let is_truncated = content_length > max_chars;
    let content = if is_truncated {
        truncate_at_boundary(&text, max_chars)
    } else {
        text
    };

    Ok(FetchResult {
        url: final_url,
        title,
        description,
        content,
        content_length,
        is_truncated,
    })
}

fn is_text_content(ct: &str) -> bool {
    ct.starts_with("text/")
        || ct.contains("json")
        || ct.contains("xml")
        || ct.contains("javascript")
}
fn extract_title(doc: &Html) -> Option<String> {
    // Try og:title first
    if let Ok(og_sel) = Selector::parse(r#"meta[property="og:title"]"#) {
        if let Some(el) = doc.select(&og_sel).next() {
            if let Some(content) = el.value().attr("content") {
                let t = content.trim();
                if !t.is_empty() {
                    return Some(t.to_string());
                }
            }
        }
    }

    // Fall back to <title>
    if let Ok(title_sel) = Selector::parse("title") {
        if let Some(el) = doc.select(&title_sel).next() {
            let t: String = el.text().collect::<String>().trim().to_string();
            if !t.is_empty() {
                return Some(t);
            }
        }
    }

    None
}

fn extract_description(doc: &Html) -> Option<String> {
    // Try og:description
    if let Ok(og_sel) = Selector::parse(r#"meta[property="og:description"]"#) {
        if let Some(el) = doc.select(&og_sel).next() {
            if let Some(content) = el.value().attr("content") {
                let d = content.trim();
                if !d.is_empty() {
                    return Some(d.to_string());
                }
            }
        }
    }

    // Fall back to meta description
    if let Ok(meta_sel) = Selector::parse(r#"meta[name="description"]"#) {
        if let Some(el) = doc.select(&meta_sel).next() {
            if let Some(content) = el.value().attr("content") {
                let d = content.trim();
                if !d.is_empty() {
                    return Some(d.to_string());
                }
            }
        }
    }

    None
}

/// Extract text content from HTML, stripping noise elements.
fn extract_text(doc: &Html) -> String {
    let mut output = String::new();

    // Try to find main content area first
    let main_selectors = ["main", "article", "[role=\"main\"]", "#content", ".content"];
    let mut root = None;
    for sel_str in &main_selectors {
        if let Ok(sel) = Selector::parse(sel_str) {
            if let Some(el) = doc.select(&sel).next() {
                root = Some(el);
                break;
            }
        }
    }

    // Fall back to body
    if root.is_none() {
        if let Ok(body_sel) = Selector::parse("body") {
            root = doc.select(&body_sel).next();
        }
    }

    if let Some(root_el) = root {
        extract_node_text(&root_el, &mut output);
    }

    // Clean up: collapse excessive whitespace/newlines
    let mut cleaned = String::new();
    let mut blank_count = 0;
    for line in output.lines() {
        let trimmed = line.trim();
        if trimmed.is_empty() {
            blank_count += 1;
            if blank_count <= 2 {
                cleaned.push('\n');
            }
        } else {
            blank_count = 0;
            cleaned.push_str(trimmed);
            cleaned.push('\n');
        }
    }

    cleaned.trim().to_string()
}

/// Recursively extract text from a node, skipping noise elements.
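///
/// For example, a hypothetical fragment
/// `<h2>News</h2><p>See <a href="https://x.io">this</a></p>` comes out
/// (after the whitespace cleanup in `extract_text`) essentially as:
///
/// ```text
/// ## News
///
/// See [this](https://x.io)
/// ```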
fn extract_node_text(element: &scraper::ElementRef, output: &mut String) {
    for child in element.children() {
        match child.value() {
            scraper::node::Node::Text(text) => {
                let t = text.trim();
                if !t.is_empty() {
                    output.push_str(t);
                    output.push(' ');
                }
            }
            scraper::node::Node::Element(el) => {
                let tag = el.name();

                // Skip noise elements entirely
                if STRIP_TAGS.contains(&tag) {
                    continue;
                }

                let child_ref = scraper::ElementRef::wrap(child);
                if let Some(child_el) = child_ref {
                    // Add newline before block elements
                    let is_block = BLOCK_TAGS.contains(&tag);
                    if is_block {
                        output.push('\n');
                    }

                    // Handle headings specially: add markdown-style prefix
                    match tag {
                        "h1" => output.push_str("# "),
                        "h2" => output.push_str("## "),
                        "h3" => output.push_str("### "),
                        "li" => output.push_str("- "),
                        "a" => {
                            // Extract link text and URL
                            let text: String = child_el.text().collect();
                            let href = el.attr("href").unwrap_or("");
                            if !text.trim().is_empty()
                                && !href.is_empty()
                                && href.starts_with("http")
                            {
                                output.push_str(&format!("[{}]({})", text.trim(), href));
                                output.push(' ');
                                continue; // Don't recurse into <a> children
                            }
                        }
                        _ => {}
                    }

                    extract_node_text(&child_el, output);

                    if is_block {
                        output.push('\n');
                    }
                }
            }
            _ => {}
        }
    }
}

/// Truncate content at a paragraph or line boundary.
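///
/// For example, `truncate_at_boundary("hello world foo", 10)` returns
/// `"hello"`: the 10-byte prefix `"hello worl"` contains no newline, so the
/// cut falls back to the last space.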
fn truncate_at_boundary(text: &str, max_chars: usize) -> String {
    if text.len() <= max_chars {
        return text.to_string();
    }

    // Back up to a char boundary so the slice below can't panic on
    // multi-byte UTF-8 sequences.
    let mut end = max_chars;
    while !text.is_char_boundary(end) {
        end -= 1;
    }
    let slice = &text[..end];

    // Try to break at a double newline (paragraph boundary)
    if let Some(pos) = slice.rfind("\n\n") {
        return slice[..pos].to_string();
    }

    // Try to break at a single newline
    if let Some(pos) = slice.rfind('\n') {
        return slice[..pos].to_string();
    }

    // Try word boundary
    if let Some(pos) = slice.rfind(' ') {
        return slice[..pos].to_string();
    }

    // Last resort: hard cut
    slice.to_string()
}

/// Format a fetch result into a readable string for the AI.
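///
/// The shape follows the pushes below; the values here are illustrative:
///
/// ```text
/// URL: https://example.com/
/// Title: Example Domain
/// Content length: 1256 bytes (truncated)
///
/// <extracted page text>
/// ```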
pub fn format_result(result: &FetchResult) -> String {
    let mut output = String::new();

    output.push_str(&format!("URL: {}\n", result.url));
    if let Some(title) = &result.title {
        output.push_str(&format!("Title: {}\n", title));
    }
    if let Some(desc) = &result.description {
        output.push_str(&format!("Description: {}\n", desc));
    }
    output.push_str(&format!(
        "Content length: {} bytes",
        result.content_length
    ));
    if result.is_truncated {
        output.push_str(" (truncated)");
    }
    output.push_str("\n\n");
    output.push_str(&result.content);

    output
}
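
// A minimal test sketch for the pure helpers; no network or HTML fixtures
// needed, and the expected values follow directly from the logic above.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn truncate_prefers_paragraph_boundary() {
        let text = "para one\n\npara two and more text here";
        assert_eq!(truncate_at_boundary(text, 20), "para one");
    }

    #[test]
    fn truncate_falls_back_to_word_boundary() {
        assert_eq!(truncate_at_boundary("hello world foo", 10), "hello");
    }

    #[test]
    fn truncate_respects_utf8_char_boundaries() {
        // A naive `&text[..2]` would panic inside the two-byte 'é'.
        let text = "héllo wörld";
        let out = truncate_at_boundary(text, 2);
        assert!(text.starts_with(out.as_str()));
    }

    #[test]
    fn detects_text_content_types() {
        assert!(is_text_content("text/html; charset=utf-8"));
        assert!(is_text_content("application/json"));
        assert!(!is_text_content("image/png"));
    }
}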