Jason Tudisco 01258fa958 feat: complete GroupChat app with AI tool calling, search, fetch, and UI
Full-stack real-time group chat with Rust/Axum backend and Riot.js frontend.

Features:
- Auth (register/login/JWT), rooms, invites, WebSocket messaging
- AI responses via OpenRouter with tool calling (Brave Search + web fetch)
- Real-time tool usage indicators (searching/reading page)
- Collapsible tool results in message bubbles
- AI stats bar (model, tokens, speed, response time) persisted to DB
- Room soft-delete, /clear command, dynamic model fetching
- Markdown rendering with code highlighting and copy buttons

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-06 18:50:52 -06:00

356 lines
10 KiB
Rust

use scraper::{Html, Selector};
use std::time::Duration;
/// Result of fetching and extracting content from a URL.
#[derive(Debug)]
pub struct FetchResult {
    /// Final URL after any redirects were followed.
    pub url: String,
    /// Page title (og:title or `<title>`), if found. `None` for
    /// plain-text/JSON responses, which skip HTML parsing.
    pub title: Option<String>,
    /// Page description (og:description or `<meta name="description">`), if found.
    pub description: Option<String>,
    /// Extracted text content, possibly truncated to the caller's limit.
    pub content: String,
    /// Size in bytes of the full extracted content *before* truncation.
    pub content_length: usize,
    /// True when `content` was cut down because it exceeded the limit.
    pub is_truncated: bool,
}
/// Tags to skip entirely during text extraction (noise elements).
/// Neither these elements nor any of their descendants contribute text.
const STRIP_TAGS: &[&str] = &[
    "script", "style", "noscript", "svg", "iframe", "nav", "footer", "aside",
];
/// Block-level tags that should produce newlines in text output.
/// The extractor emits a newline both before and after each of these.
const BLOCK_TAGS: &[&str] = &[
    "p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "li", "br", "tr",
    "blockquote", "pre", "section", "article", "main", "header",
    "dt", "dd", "figcaption", "table", "thead", "tbody",
];
/// Fetch a URL and extract its text content.
/// Returns structured result with metadata and cleaned content.
///
/// `max_chars` caps the returned `content`; note it is compared against the
/// extracted text's **byte** length (`.len()`), so for multi-byte pages the
/// character count may be smaller. `content_length` reports the
/// pre-truncation size in bytes.
///
/// # Errors
/// Returns a human-readable message on: non-http(s) scheme, client build
/// failure, network/timeout failure, non-2xx status, non-text content type,
/// an oversized declared Content-Length, or a body decode failure.
pub async fn fetch_url(url: &str, max_chars: usize) -> Result<FetchResult, String> {
    // Validate URL: only plain http(s) is accepted; rejects file://,
    // data:, ftp://, and schemeless input.
    if !url.starts_with("http://") && !url.starts_with("https://") {
        return Err("URL must start with http:// or https://".into());
    }

    // Per-call client: 30s overall timeout, 10s connect timeout, at most
    // 10 redirects, and transparent gzip/brotli/deflate decompression.
    let client = reqwest::Client::builder()
        .timeout(Duration::from_secs(30))
        .connect_timeout(Duration::from_secs(10))
        .redirect(reqwest::redirect::Policy::limited(10))
        .user_agent("GroupChat-AI/1.0 (Web Fetch Tool)")
        .gzip(true)
        .brotli(true)
        .deflate(true)
        .build()
        .map_err(|e| format!("Failed to build HTTP client: {}", e))?;

    let response = client
        .get(url)
        .send()
        .await
        .map_err(|e| format!("Failed to fetch URL: {}", e))?;

    // Capture the post-redirect URL so the caller sees where we landed.
    let final_url = response.url().to_string();
    let status = response.status();
    if !status.is_success() {
        return Err(format!(
            "HTTP error {}: {}",
            status.as_u16(),
            status.canonical_reason().unwrap_or("Unknown")
        ));
    }

    // Check content type; a missing header is treated as HTML.
    let content_type = response
        .headers()
        .get("content-type")
        .and_then(|v| v.to_str().ok())
        .unwrap_or("text/html")
        .to_lowercase();
    if !is_text_content(&content_type) {
        return Err(format!(
            "Cannot read non-text content type: {}",
            content_type
        ));
    }

    // Check content length (reject > 2MB)
    // NOTE(review): this only guards when the server declares Content-Length;
    // chunked/streamed responses are not size-limited here — confirm whether
    // that is acceptable for this tool.
    if let Some(len) = response.content_length() {
        if len > 2 * 1024 * 1024 {
            return Err(format!("Content too large: {} bytes (max 2MB)", len));
        }
    }

    let body = response
        .text()
        .await
        .map_err(|e| format!("Failed to read response body: {}", e))?;

    // If it's plain text or JSON, return as-is (with truncation) — no HTML
    // parsing, so title/description stay None.
    if content_type.contains("json") || content_type.starts_with("text/plain") {
        let content_length = body.len();
        let is_truncated = content_length > max_chars;
        let content = if is_truncated {
            truncate_at_boundary(&body, max_chars)
        } else {
            body
        };
        return Ok(FetchResult {
            url: final_url,
            title: None,
            description: None,
            content,
            content_length,
            is_truncated,
        });
    }

    // Parse HTML
    let document = Html::parse_document(&body);

    // Extract metadata
    let title = extract_title(&document);
    let description = extract_description(&document);

    // Extract text content
    let text = extract_text(&document);
    let content_length = text.len();
    let is_truncated = content_length > max_chars;
    let content = if is_truncated {
        truncate_at_boundary(&text, max_chars)
    } else {
        text
    };

    Ok(FetchResult {
        url: final_url,
        title,
        description,
        content,
        content_length,
        is_truncated,
    })
}
/// Whether a Content-Type denotes a textual payload we can safely read.
/// Accepts any `text/*` type plus JSON, XML, and JavaScript variants
/// (matched as substrings, so `application/ld+json` etc. qualify).
fn is_text_content(ct: &str) -> bool {
    if ct.starts_with("text/") {
        return true;
    }
    ["json", "xml", "javascript"]
        .iter()
        .any(|marker| ct.contains(marker))
}
/// Best-effort page title: the Open Graph `og:title` meta tag wins,
/// otherwise the text of the first `<title>` element. Empty/whitespace-only
/// candidates are treated as absent.
fn extract_title(doc: &Html) -> Option<String> {
    let og_title = Selector::parse(r#"meta[property="og:title"]"#)
        .ok()
        .and_then(|sel| {
            let raw = doc.select(&sel).next()?.value().attr("content")?;
            let trimmed = raw.trim();
            if trimmed.is_empty() {
                None
            } else {
                Some(trimmed.to_string())
            }
        });
    if og_title.is_some() {
        return og_title;
    }
    // Fall back to the concatenated text of the first <title> element.
    Selector::parse("title").ok().and_then(|sel| {
        let joined: String = doc.select(&sel).next()?.text().collect();
        let trimmed = joined.trim();
        if trimmed.is_empty() {
            None
        } else {
            Some(trimmed.to_string())
        }
    })
}
/// Best-effort page description: `og:description` first, then the standard
/// `<meta name="description">`. The first non-empty trimmed value wins.
fn extract_description(doc: &Html) -> Option<String> {
    [
        r#"meta[property="og:description"]"#,
        r#"meta[name="description"]"#,
    ]
    .iter()
    .find_map(|sel_str| {
        let sel = Selector::parse(sel_str).ok()?;
        let content = doc.select(&sel).next()?.value().attr("content")?;
        let trimmed = content.trim();
        if trimmed.is_empty() {
            None
        } else {
            Some(trimmed.to_string())
        }
    })
}
/// Extract text content from HTML, stripping noise elements.
///
/// Picks the first matching main-content container (`main`, `article`,
/// `[role="main"]`, `#content`, `.content`) or falls back to `<body>`,
/// recursively extracts its text, then trims every line and collapses runs
/// of blank lines to at most two.
fn extract_text(doc: &Html) -> String {
    const MAIN_SELECTORS: [&str; 5] =
        ["main", "article", "[role=\"main\"]", "#content", ".content"];

    // Prefer a recognizable main-content region over the whole document.
    let mut root = MAIN_SELECTORS
        .iter()
        .filter_map(|s| Selector::parse(s).ok())
        .find_map(|sel| doc.select(&sel).next());

    // Fall back to <body> when no main-content container exists.
    if root.is_none() {
        root = Selector::parse("body")
            .ok()
            .and_then(|sel| doc.select(&sel).next());
    }

    let mut raw = String::new();
    if let Some(root_el) = root {
        extract_node_text(&root_el, &mut raw);
    }

    // Normalize whitespace: trim each line; allow at most two consecutive
    // blank lines in the output.
    let mut cleaned = String::new();
    let mut consecutive_blanks = 0;
    for line in raw.lines() {
        let line = line.trim();
        if line.is_empty() {
            consecutive_blanks += 1;
            if consecutive_blanks <= 2 {
                cleaned.push('\n');
            }
        } else {
            consecutive_blanks = 0;
            cleaned.push_str(line);
            cleaned.push('\n');
        }
    }
    cleaned.trim().to_string()
}
/// Recursively extract text from a node, skipping noise elements.
///
/// Appends to `output`: trimmed text nodes separated by single spaces,
/// newlines before and after block-level tags, markdown-style prefixes for
/// h1–h3 ("# ", "## ", "### ") and list items ("- "), and `[text](href)`
/// for links whose href starts with "http".
fn extract_node_text(element: &scraper::ElementRef, output: &mut String) {
    for child in element.children() {
        match child.value() {
            scraper::node::Node::Text(text) => {
                // Collapse surrounding whitespace; join adjacent text runs
                // with a single space.
                let t = text.trim();
                if !t.is_empty() {
                    output.push_str(t);
                    output.push(' ');
                }
            }
            scraper::node::Node::Element(el) => {
                let tag = el.name();
                // Skip noise elements entirely
                if STRIP_TAGS.contains(&tag) {
                    continue;
                }
                let child_ref = scraper::ElementRef::wrap(child);
                if let Some(child_el) = child_ref {
                    // Add newline before block elements
                    let is_block = BLOCK_TAGS.contains(&tag);
                    if is_block {
                        output.push('\n');
                    }
                    // Handle headings specially — add markdown-style prefix
                    // (h4–h6 get block newlines but no heading marker).
                    match tag {
                        "h1" => output.push_str("# "),
                        "h2" => output.push_str("## "),
                        "h3" => output.push_str("### "),
                        "li" => output.push_str("- "),
                        "a" => {
                            // Extract link text and URL
                            let text: String = child_el.text().collect();
                            let href = el.attr("href").unwrap_or("");
                            // Only absolute http(s) links become markdown;
                            // relative or empty hrefs fall through below and
                            // are recursed into as plain content.
                            if !text.trim().is_empty()
                                && !href.is_empty()
                                && href.starts_with("http")
                            {
                                output.push_str(&format!("[{}]({})", text.trim(), href));
                                output.push(' ');
                                continue; // Don't recurse into <a> children
                            }
                        }
                        _ => {}
                    }
                    extract_node_text(&child_el, output);
                    if is_block {
                        output.push('\n');
                    }
                }
            }
            // Comments, doctype, processing instructions: no text.
            _ => {}
        }
    }
}
/// Truncate content at a paragraph or line boundary.
///
/// `max_chars` is interpreted as a **byte** budget (callers pass `.len()`
/// values). The cut point is first floored to a UTF-8 character boundary so
/// slicing can never panic on multi-byte text, then pulled back to the
/// nicest available break: paragraph (`\n\n`), line (`\n`), word (space),
/// or — as a last resort — a hard cut at the boundary.
fn truncate_at_boundary(text: &str, max_chars: usize) -> String {
    if text.len() <= max_chars {
        return text.to_string();
    }
    // Bug fix: `&text[..max_chars]` panics when max_chars lands inside a
    // multi-byte UTF-8 sequence (common on non-ASCII pages). Floor the cut
    // to the nearest char boundary at or below the budget first.
    let mut cut = max_chars;
    while !text.is_char_boundary(cut) {
        cut -= 1;
    }
    let slice = &text[..cut];
    // Try to break at a double newline (paragraph boundary)
    if let Some(pos) = slice.rfind("\n\n") {
        return slice[..pos].to_string();
    }
    // Try to break at a single newline
    if let Some(pos) = slice.rfind('\n') {
        return slice[..pos].to_string();
    }
    // Try word boundary
    if let Some(pos) = slice.rfind(' ') {
        return slice[..pos].to_string();
    }
    // Last resort: hard cut
    slice.to_string()
}
/// Format a fetch result into a readable string for the AI.
///
/// Layout: a `URL:` line, optional `Title:` / `Description:` lines, a
/// content-length summary (with a "(truncated)" marker when applicable), a
/// blank line, then the content itself.
pub fn format_result(result: &FetchResult) -> String {
    let mut out = String::new();
    out.push_str("URL: ");
    out.push_str(&result.url);
    out.push('\n');
    if let Some(title) = result.title.as_deref() {
        out.push_str("Title: ");
        out.push_str(title);
        out.push('\n');
    }
    if let Some(desc) = result.description.as_deref() {
        out.push_str("Description: ");
        out.push_str(desc);
        out.push('\n');
    }
    out.push_str("Content length: ");
    out.push_str(&result.content_length.to_string());
    out.push_str(" characters");
    if result.is_truncated {
        out.push_str(" (truncated)");
    }
    out.push_str("\n\n");
    out.push_str(&result.content);
    out
}