Full-stack real-time group chat with Rust/Axum backend and Riot.js frontend.

Features:
- Auth (register/login/JWT), rooms, invites, WebSocket messaging
- AI responses via OpenRouter with tool calling (Brave Search + web fetch)
- Real-time tool usage indicators (searching/reading page)
- Collapsible tool results in message bubbles
- AI stats bar (model, tokens, speed, response time) persisted to DB
- Room soft-delete, /clear command, dynamic model fetching
- Markdown rendering with code highlighting and copy buttons

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
use scraper::{Html, Selector};
use std::time::Duration;

/// Result of fetching and extracting content from a URL.
#[derive(Debug)]
pub struct FetchResult {
    pub url: String,
    pub title: Option<String>,
    pub description: Option<String>,
    pub content: String,
    /// Byte length of the extracted content before any truncation.
    pub content_length: usize,
    pub is_truncated: bool,
}

/// Tags to skip entirely during text extraction (noise elements).
const STRIP_TAGS: &[&str] = &[
    "script", "style", "noscript", "svg", "iframe", "nav", "footer", "aside",
];

/// Block-level tags that should produce newlines in text output.
const BLOCK_TAGS: &[&str] = &[
    "p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "li", "br", "tr",
    "blockquote", "pre", "section", "article", "main", "header",
    "dt", "dd", "figcaption", "table", "thead", "tbody",
];

/// Fetch a URL and extract its text content.
/// Returns a structured result with metadata and cleaned content.
pub async fn fetch_url(url: &str, max_chars: usize) -> Result<FetchResult, String> {
    // Validate URL
    if !url.starts_with("http://") && !url.starts_with("https://") {
        return Err("URL must start with http:// or https://".into());
    }

    let client = reqwest::Client::builder()
        .timeout(Duration::from_secs(30))
        .connect_timeout(Duration::from_secs(10))
        .redirect(reqwest::redirect::Policy::limited(10))
        .user_agent("GroupChat-AI/1.0 (Web Fetch Tool)")
        .gzip(true)
        .brotli(true)
        .deflate(true)
        .build()
        .map_err(|e| format!("Failed to build HTTP client: {}", e))?;

    let response = client
        .get(url)
        .send()
        .await
        .map_err(|e| format!("Failed to fetch URL: {}", e))?;

    let final_url = response.url().to_string();
    let status = response.status();

    if !status.is_success() {
        return Err(format!(
            "HTTP error {}: {}",
            status.as_u16(),
            status.canonical_reason().unwrap_or("Unknown")
        ));
    }

    // Check content type
    let content_type = response
        .headers()
        .get("content-type")
        .and_then(|v| v.to_str().ok())
        .unwrap_or("text/html")
        .to_lowercase();

    if !is_text_content(&content_type) {
        return Err(format!(
            "Cannot read non-text content type: {}",
            content_type
        ));
    }

    // Check content length (reject > 2MB)
    if let Some(len) = response.content_length() {
        if len > 2 * 1024 * 1024 {
            return Err(format!("Content too large: {} bytes (max 2MB)", len));
        }
    }

    let body = response
        .text()
        .await
        .map_err(|e| format!("Failed to read response body: {}", e))?;

    // If it's plain text or JSON, return as-is (with truncation)
    if content_type.contains("json") || content_type.starts_with("text/plain") {
        let content_length = body.len();
        let is_truncated = content_length > max_chars;
        let content = if is_truncated {
            truncate_at_boundary(&body, max_chars)
        } else {
            body
        };

        return Ok(FetchResult {
            url: final_url,
            title: None,
            description: None,
            content,
            content_length,
            is_truncated,
        });
    }

    // Parse HTML
    let document = Html::parse_document(&body);

    // Extract metadata
    let title = extract_title(&document);
    let description = extract_description(&document);

    // Extract text content
    let text = extract_text(&document);
    let content_length = text.len();
    let is_truncated = content_length > max_chars;
    let content = if is_truncated {
        truncate_at_boundary(&text, max_chars)
    } else {
        text
    };

    Ok(FetchResult {
        url: final_url,
        title,
        description,
        content,
        content_length,
        is_truncated,
    })
}

fn is_text_content(ct: &str) -> bool {
    ct.starts_with("text/")
        || ct.contains("json")
        || ct.contains("xml")
        || ct.contains("javascript")
}
fn extract_title(doc: &Html) -> Option<String> {
    // Try og:title first
    if let Ok(og_sel) = Selector::parse(r#"meta[property="og:title"]"#) {
        if let Some(el) = doc.select(&og_sel).next() {
            if let Some(content) = el.value().attr("content") {
                let t = content.trim();
                if !t.is_empty() {
                    return Some(t.to_string());
                }
            }
        }
    }

    // Fall back to <title>
    if let Ok(title_sel) = Selector::parse("title") {
        if let Some(el) = doc.select(&title_sel).next() {
            let t: String = el.text().collect::<String>().trim().to_string();
            if !t.is_empty() {
                return Some(t);
            }
        }
    }

    None
}

fn extract_description(doc: &Html) -> Option<String> {
    // Try og:description
    if let Ok(og_sel) = Selector::parse(r#"meta[property="og:description"]"#) {
        if let Some(el) = doc.select(&og_sel).next() {
            if let Some(content) = el.value().attr("content") {
                let d = content.trim();
                if !d.is_empty() {
                    return Some(d.to_string());
                }
            }
        }
    }

    // Fall back to meta description
    if let Ok(meta_sel) = Selector::parse(r#"meta[name="description"]"#) {
        if let Some(el) = doc.select(&meta_sel).next() {
            if let Some(content) = el.value().attr("content") {
                let d = content.trim();
                if !d.is_empty() {
                    return Some(d.to_string());
                }
            }
        }
    }

    None
}

/// Extract text content from HTML, stripping noise elements.
fn extract_text(doc: &Html) -> String {
    let mut output = String::new();

    // Try to find main content area first
    let main_selectors = ["main", "article", "[role=\"main\"]", "#content", ".content"];
    let mut root = None;
    for sel_str in &main_selectors {
        if let Ok(sel) = Selector::parse(sel_str) {
            if let Some(el) = doc.select(&sel).next() {
                root = Some(el);
                break;
            }
        }
    }

    // Fall back to body
    if root.is_none() {
        if let Ok(body_sel) = Selector::parse("body") {
            root = doc.select(&body_sel).next();
        }
    }

    if let Some(root_el) = root {
        extract_node_text(&root_el, &mut output);
    }

    // Clean up: collapse excessive whitespace/newlines
    let mut cleaned = String::new();
    let mut blank_count = 0;
    for line in output.lines() {
        let trimmed = line.trim();
        if trimmed.is_empty() {
            blank_count += 1;
            if blank_count <= 2 {
                cleaned.push('\n');
            }
        } else {
            blank_count = 0;
            cleaned.push_str(trimmed);
            cleaned.push('\n');
        }
    }

    cleaned.trim().to_string()
}

/// Recursively extract text from a node, skipping noise elements.
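///
/// For example, a hypothetical fragment
/// `<h2>News</h2><p>See <a href="https://x.io">this</a></p>` comes out
/// (after the whitespace cleanup in `extract_text`) essentially as:
///
/// ```text
/// ## News
///
/// See [this](https://x.io)
/// ```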
fn extract_node_text(element: &scraper::ElementRef, output: &mut String) {
    for child in element.children() {
        match child.value() {
            scraper::node::Node::Text(text) => {
                let t = text.trim();
                if !t.is_empty() {
                    output.push_str(t);
                    output.push(' ');
                }
            }
            scraper::node::Node::Element(el) => {
                let tag = el.name();

                // Skip noise elements entirely
                if STRIP_TAGS.contains(&tag) {
                    continue;
                }

                let child_ref = scraper::ElementRef::wrap(child);
                if let Some(child_el) = child_ref {
                    // Add newline before block elements
                    let is_block = BLOCK_TAGS.contains(&tag);
                    if is_block {
                        output.push('\n');
                    }

                    // Handle headings specially: add markdown-style prefix
                    match tag {
                        "h1" => output.push_str("# "),
                        "h2" => output.push_str("## "),
                        "h3" => output.push_str("### "),
                        "li" => output.push_str("- "),
                        "a" => {
                            // Extract link text and URL
                            let text: String = child_el.text().collect();
                            let href = el.attr("href").unwrap_or("");
                            if !text.trim().is_empty()
                                && !href.is_empty()
                                && href.starts_with("http")
                            {
                                output.push_str(&format!("[{}]({})", text.trim(), href));
                                output.push(' ');
                                continue; // Don't recurse into <a> children
                            }
                        }
                        _ => {}
                    }

                    extract_node_text(&child_el, output);

                    if is_block {
                        output.push('\n');
                    }
                }
            }
            _ => {}
        }
    }
}

/// Truncate content at a paragraph or line boundary.
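///
/// For example, `truncate_at_boundary("hello world foo", 10)` returns
/// `"hello"`: the 10-byte prefix `"hello worl"` contains no newline, so the
/// cut falls back to the last space.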
fn truncate_at_boundary(text: &str, max_chars: usize) -> String {
    if text.len() <= max_chars {
        return text.to_string();
    }

    // Back up to a char boundary so the slice below can't panic on
    // multi-byte UTF-8 sequences.
    let mut end = max_chars;
    while !text.is_char_boundary(end) {
        end -= 1;
    }
    let slice = &text[..end];

    // Try to break at a double newline (paragraph boundary)
    if let Some(pos) = slice.rfind("\n\n") {
        return slice[..pos].to_string();
    }

    // Try to break at a single newline
    if let Some(pos) = slice.rfind('\n') {
        return slice[..pos].to_string();
    }

    // Try word boundary
    if let Some(pos) = slice.rfind(' ') {
        return slice[..pos].to_string();
    }

    // Last resort: hard cut
    slice.to_string()
}

/// Format a fetch result into a readable string for the AI.
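///
/// The shape follows the pushes below; the values here are illustrative:
///
/// ```text
/// URL: https://example.com/
/// Title: Example Domain
/// Content length: 1256 bytes (truncated)
///
/// <extracted page text>
/// ```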
pub fn format_result(result: &FetchResult) -> String {
    let mut output = String::new();

    output.push_str(&format!("URL: {}\n", result.url));
    if let Some(title) = &result.title {
        output.push_str(&format!("Title: {}\n", title));
    }
    if let Some(desc) = &result.description {
        output.push_str(&format!("Description: {}\n", desc));
    }
    output.push_str(&format!(
        "Content length: {} bytes",
        result.content_length
    ));
    if result.is_truncated {
        output.push_str(" (truncated)");
    }
    output.push_str("\n\n");
    output.push_str(&result.content);

    output
}
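
// A minimal test sketch for the pure helpers; no network or HTML fixtures
// needed, and the expected values follow directly from the logic above.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn truncate_prefers_paragraph_boundary() {
        let text = "para one\n\npara two and more text here";
        assert_eq!(truncate_at_boundary(text, 20), "para one");
    }

    #[test]
    fn truncate_falls_back_to_word_boundary() {
        assert_eq!(truncate_at_boundary("hello world foo", 10), "hello");
    }

    #[test]
    fn truncate_respects_utf8_char_boundaries() {
        // A naive `&text[..2]` would panic inside the two-byte 'é'.
        let text = "héllo wörld";
        let out = truncate_at_boundary(text, 2);
        assert!(text.starts_with(out.as_str()));
    }

    #[test]
    fn detects_text_content_types() {
        assert!(is_text_content("text/html; charset=utf-8"));
        assert!(is_text_content("application/json"));
        assert!(!is_text_content("image/png"));
    }
}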