use scraper::{Html, Selector}; use std::time::Duration; /// Result of fetching and extracting content from a URL. #[derive(Debug)] pub struct FetchResult { pub url: String, pub title: Option, pub description: Option, pub content: String, pub content_length: usize, pub is_truncated: bool, } /// Tags to skip entirely during text extraction (noise elements). const STRIP_TAGS: &[&str] = &[ "script", "style", "noscript", "svg", "iframe", "nav", "footer", "aside", ]; /// Block-level tags that should produce newlines in text output. const BLOCK_TAGS: &[&str] = &[ "p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "li", "br", "tr", "blockquote", "pre", "section", "article", "main", "header", "dt", "dd", "figcaption", "table", "thead", "tbody", ]; /// Fetch a URL and extract its text content. /// Returns structured result with metadata and cleaned content. pub async fn fetch_url(url: &str, max_chars: usize) -> Result { // Validate URL if !url.starts_with("http://") && !url.starts_with("https://") { return Err("URL must start with http:// or https://".into()); } let client = reqwest::Client::builder() .timeout(Duration::from_secs(30)) .connect_timeout(Duration::from_secs(10)) .redirect(reqwest::redirect::Policy::limited(10)) .user_agent("GroupChat-AI/1.0 (Web Fetch Tool)") .gzip(true) .brotli(true) .deflate(true) .build() .map_err(|e| format!("Failed to build HTTP client: {}", e))?; let response = client .get(url) .send() .await .map_err(|e| format!("Failed to fetch URL: {}", e))?; let final_url = response.url().to_string(); let status = response.status(); if !status.is_success() { return Err(format!( "HTTP error {}: {}", status.as_u16(), status.canonical_reason().unwrap_or("Unknown") )); } // Check content type let content_type = response .headers() .get("content-type") .and_then(|v| v.to_str().ok()) .unwrap_or("text/html") .to_lowercase(); if !is_text_content(&content_type) { return Err(format!( "Cannot read non-text content type: {}", content_type )); } // Check content 
length (reject > 2MB) if let Some(len) = response.content_length() { if len > 2 * 1024 * 1024 { return Err(format!("Content too large: {} bytes (max 2MB)", len)); } } let body = response .text() .await .map_err(|e| format!("Failed to read response body: {}", e))?; // If it's plain text or JSON, return as-is (with truncation) if content_type.contains("json") || content_type.starts_with("text/plain") { let content_length = body.len(); let is_truncated = content_length > max_chars; let content = if is_truncated { truncate_at_boundary(&body, max_chars) } else { body }; return Ok(FetchResult { url: final_url, title: None, description: None, content, content_length, is_truncated, }); } // Parse HTML let document = Html::parse_document(&body); // Extract metadata let title = extract_title(&document); let description = extract_description(&document); // Extract text content let text = extract_text(&document); let content_length = text.len(); let is_truncated = content_length > max_chars; let content = if is_truncated { truncate_at_boundary(&text, max_chars) } else { text }; Ok(FetchResult { url: final_url, title, description, content, content_length, is_truncated, }) } fn is_text_content(ct: &str) -> bool { ct.starts_with("text/") || ct.contains("json") || ct.contains("xml") || ct.contains("javascript") } fn extract_title(doc: &Html) -> Option { // Try og:title first if let Ok(og_sel) = Selector::parse(r#"meta[property="og:title"]"#) { if let Some(el) = doc.select(&og_sel).next() { if let Some(content) = el.value().attr("content") { let t = content.trim(); if !t.is_empty() { return Some(t.to_string()); } } } } // Fall back to if let Ok(title_sel) = Selector::parse("title") { if let Some(el) = doc.select(&title_sel).next() { let t: String = el.text().collect::<String>().trim().to_string(); if !t.is_empty() { return Some(t); } } } None } fn extract_description(doc: &Html) -> Option<String> { // Try og:description if let Ok(og_sel) = 
Selector::parse(r#"meta[property="og:description"]"#) { if let Some(el) = doc.select(&og_sel).next() { if let Some(content) = el.value().attr("content") { let d = content.trim(); if !d.is_empty() { return Some(d.to_string()); } } } } // Fall back to meta description if let Ok(meta_sel) = Selector::parse(r#"meta[name="description"]"#) { if let Some(el) = doc.select(&meta_sel).next() { if let Some(content) = el.value().attr("content") { let d = content.trim(); if !d.is_empty() { return Some(d.to_string()); } } } } None } /// Extract text content from HTML, stripping noise elements. fn extract_text(doc: &Html) -> String { let mut output = String::new(); // Try to find main content area first let main_selectors = ["main", "article", "[role=\"main\"]", "#content", ".content"]; let mut root = None; for sel_str in &main_selectors { if let Ok(sel) = Selector::parse(sel_str) { if let Some(el) = doc.select(&sel).next() { root = Some(el); break; } } } // Fall back to body if root.is_none() { if let Ok(body_sel) = Selector::parse("body") { root = doc.select(&body_sel).next(); } } if let Some(root_el) = root { extract_node_text(&root_el, &mut output); } // Clean up: collapse excessive whitespace/newlines let mut cleaned = String::new(); let mut blank_count = 0; for line in output.lines() { let trimmed = line.trim(); if trimmed.is_empty() { blank_count += 1; if blank_count <= 2 { cleaned.push('\n'); } } else { blank_count = 0; cleaned.push_str(trimmed); cleaned.push('\n'); } } cleaned.trim().to_string() } /// Recursively extract text from a node, skipping noise elements. 
fn extract_node_text(element: &scraper::ElementRef, output: &mut String) { for child in element.children() { match child.value() { scraper::node::Node::Text(text) => { let t = text.trim(); if !t.is_empty() { output.push_str(t); output.push(' '); } } scraper::node::Node::Element(el) => { let tag = el.name(); // Skip noise elements entirely if STRIP_TAGS.contains(&tag) { continue; } let child_ref = scraper::ElementRef::wrap(child); if let Some(child_el) = child_ref { // Add newline before block elements let is_block = BLOCK_TAGS.contains(&tag); if is_block { output.push('\n'); } // Handle headings specially — add markdown-style prefix match tag { "h1" => output.push_str("# "), "h2" => output.push_str("## "), "h3" => output.push_str("### "), "li" => output.push_str("- "), "a" => { // Extract link text and URL let text: String = child_el.text().collect(); let href = el.attr("href").unwrap_or(""); if !text.trim().is_empty() && !href.is_empty() && href.starts_with("http") { output.push_str(&format!("[{}]({})", text.trim(), href)); output.push(' '); continue; // Don't recurse into <a> children } } _ => {} } extract_node_text(&child_el, output); if is_block { output.push('\n'); } } } _ => {} } } } /// Truncate content at a paragraph or line boundary. fn truncate_at_boundary(text: &str, max_chars: usize) -> String { if text.len() <= max_chars { return text.to_string(); } let slice = &text[..max_chars]; // Try to break at a double newline (paragraph boundary) if let Some(pos) = slice.rfind("\n\n") { return slice[..pos].to_string(); } // Try to break at a single newline if let Some(pos) = slice.rfind('\n') { return slice[..pos].to_string(); } // Try word boundary if let Some(pos) = slice.rfind(' ') { return slice[..pos].to_string(); } // Last resort: hard cut slice.to_string() } /// Format a fetch result into a readable string for the AI. 
pub fn format_result(result: &FetchResult) -> String {
    // Metadata header: URL always; title/description only when present.
    let mut out = format!("URL: {}\n", result.url);

    if let Some(title) = result.title.as_deref() {
        out.push_str("Title: ");
        out.push_str(title);
        out.push('\n');
    }
    if let Some(desc) = result.description.as_deref() {
        out.push_str("Description: ");
        out.push_str(desc);
        out.push('\n');
    }

    // Length line carries a marker when the content was cut down.
    out.push_str(&format!(
        "Content length: {} characters",
        result.content_length
    ));
    if result.is_truncated {
        out.push_str(" (truncated)");
    }

    // Blank line separates the header from the extracted content.
    out.push_str("\n\n");
    out.push_str(&result.content);
    out
}