feat: add streaming AI responses with smooth token-by-token rendering

Replaces batch AI responses with real-time SSE streaming from OpenRouter.
Tokens are buffered client-side and drained via requestAnimationFrame for
a smooth typing effect instead of choppy chunk dumps.

Backend:
- Rewrite openrouter service for SSE streaming with incremental tool call accumulation
- Add AiStreamChunk/AiStreamEnd WebSocket event types
- Stream content deltas to clients during all tool call rounds
- Increase broadcast channel capacity (256 -> 4096) and handle Lagged errors gracefully

Frontend:
- Add StreamBuffer utility with adaptive rAF-based character draining
- Show streaming message-bubble with blinking cursor during generation
- Clean up buffer on room switch and final message replacement

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Author: Jason Tudisco
Date: 2026-03-06 20:23:49 -06:00
Parent: 01258fa958
Commit: 4a002c85d4

8 changed files with 533 additions and 156 deletions

View File

@@ -26,6 +26,7 @@
ai-typing={state.aiTyping}
ai-tool-status={state.aiToolStatus}
typing-users={state.typingUsers}
streaming-message={state.streamingMessage}
cb-send={sendMessage}
cb-invite={() => update({ showInviteModal: true })}
cb-delete-room={() => update({ showDeleteModal: true })}
@@ -142,6 +143,7 @@
<script>
import { api, saveAuth, getUser, clearAuth, isAuthenticated } from '../services/api.js'
import { ws } from '../services/websocket.js'
import { StreamBuffer } from '../services/stream-buffer.js'
export default {
state: {
@@ -157,6 +159,7 @@
showClearModal: false,
aiTyping: false,
aiToolStatus: null,
streamingMessage: null,
typingUsers: [],
},
@@ -178,15 +181,76 @@
ws.on('new_message', (msg) => {
if (msg.message.room_id === this.state.activeRoomId) {
// If we were streaming this message, cancel the buffer and remove placeholder
const isStreamReplacement = this.state.streamingMessage?.id === msg.message.id
if (isStreamReplacement && this.streamBuffer) {
this.streamBuffer.cancel()
this.streamBuffer = null
this._streamMsgId = null
this._streamContent = ''
}
this.update({
messages: [...this.state.messages, msg.message],
aiTyping: false,
aiToolStatus: null,
streamingMessage: isStreamReplacement ? null : this.state.streamingMessage,
})
this.scrollToBottom()
}
})
ws.on('ai_stream_chunk', (msg) => {
if (msg.room_id === this.state.activeRoomId) {
if (!this.streamBuffer || this._streamMsgId !== msg.message_id) {
// First chunk for a new message — create buffer + streaming message
if (this.streamBuffer) this.streamBuffer.cancel()
this._streamMsgId = msg.message_id
this._streamContent = ''
this.streamBuffer = new StreamBuffer(
// onText: drip chars into the displayed message
(text) => {
this._streamContent += text
this.update({
streamingMessage: {
id: this._streamMsgId,
room_id: msg.room_id,
sender_id: 'ai-assistant',
sender_name: 'AI Assistant',
content: this._streamContent,
mentions: [],
is_ai: true,
streaming: true,
},
})
this.scrollToBottom()
},
// onDone: buffer fully drained after stream ended
() => {
if (this.state.streamingMessage?.id === this._streamMsgId) {
this.update({
streamingMessage: { ...this.state.streamingMessage, streaming: false },
})
}
}
)
this.update({ aiTyping: false })
}
// Push every chunk into the buffer (it drains smoothly via rAF)
this.streamBuffer.push(msg.delta)
}
})
ws.on('ai_stream_end', (msg) => {
if (msg.room_id === this.state.activeRoomId) {
// Tell buffer to flush remaining text, then signal done
if (this.streamBuffer && this._streamMsgId === msg.message_id) {
this.streamBuffer.finish()
}
}
})
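// Lifecycle of one AI reply: ai_typing, then ai_stream_chunk (repeated),
// then ai_stream_end (flush the buffer), then new_message with the final
// persisted message, which replaces the streaming placeholder above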
ws.on('ai_typing', (msg) => {
if (msg.room_id === this.state.activeRoomId) {
this.update({ aiTyping: true })
@@ -259,6 +323,13 @@
async selectRoom(roomId) {
try {
// Cancel any active stream buffer when switching rooms
if (this.streamBuffer) {
this.streamBuffer.cancel()
this.streamBuffer = null
this._streamMsgId = null
this._streamContent = ''
}
const [room, messages] = await Promise.all([
api.getRoom(roomId),
api.getMessages(roomId),
@@ -269,6 +340,7 @@
messages,
aiTyping: false,
aiToolStatus: null,
streamingMessage: null,
typingUsers: [],
})
ws.joinRoom(roomId)

View File

@@ -40,7 +40,17 @@
/>
</div>
<div if={props.aiTyping} class="typing-indicator ai-typing">
<!-- Streaming AI message (live content) -->
<div if={props.streamingMessage} key="streaming">
<message-bubble
message={props.streamingMessage}
is-own={false}
is-streaming={true}
/>
</div>
<!-- AI typing indicator (only when NOT streaming content) -->
<div if={props.aiTyping && !props.streamingMessage} class="typing-indicator ai-typing">
<div class="typing-avatar ai-avatar">AI</div>
<template if={props.aiToolStatus}>
<span class="tool-status-text">

View File

@@ -25,7 +25,8 @@
</div>
</div>
</div>
<div class="message-content markdown-content"></div>
<div if={props.isStreaming} class="message-content streaming-content">{props.message?.content}<span class="streaming-cursor">▌</span></div>
<div if={!props.isStreaming} class="message-content markdown-content"></div>
<div if={props.message?.is_ai && props.message?.ai_meta} class="ai-stats-bar">
<button class="ai-stat-btn" onclick={copyFullMessage} title="Copy response">
<svg width="13" height="13" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
@@ -280,6 +281,22 @@
color: var(--accent);
font-weight: 500;
}
.streaming-content {
white-space: pre-wrap;
word-wrap: break-word;
}
.streaming-cursor {
animation: cursor-blink 0.8s step-end infinite;
color: var(--accent);
font-weight: 300;
}
@keyframes cursor-blink {
0%, 100% { opacity: 1; }
50% { opacity: 0; }
}
</style>
<script>
@@ -295,7 +312,8 @@
},
renderContent() {
const el = this.$('.message-content')
if (this.props.isStreaming) return // Don't markdown-render while streaming
const el = this.$('.message-content.markdown-content')
if (el && this.props.message?.content) {
el.innerHTML = renderMarkdown(this.props.message.content)
// Inject copy buttons into code blocks

View File

@@ -0,0 +1,92 @@
/**
* StreamBuffer: smooth token-by-token rendering for LLM streams.
*
* The network delivers tokens in bursts (5-10 at once, then a pause).
* This buffer queues incoming text and drains it at a steady rate
* via requestAnimationFrame, creating the smooth "typing" effect
* seen in ChatGPT / Claude.ai.
*/
export class StreamBuffer {
/**
* @param {(text: string) => void} onText called each frame with chars to append
* @param {() => void} onDone called when buffer fully drained after finish()
*/
constructor(onText, onDone) {
this.queue = ''
this.onText = onText
this.onDone = onDone
this.rafId = null
this.finished = false // stream has ended, flush remaining
this.baseSpeed = 3 // chars per frame at 60fps (~180 chars/sec)
}
/** Push new text from a stream chunk into the buffer. */
push(text) {
this.queue += text
if (!this.rafId) {
this.startDrain()
}
}
/** Signal that the stream is complete — flush remaining text quickly. */
finish() {
this.finished = true
if (this.rafId) return // an in-flight drain will see `finished`
if (this.queue.length > 0) {
this.startDrain()
} else {
// Nothing buffered: fire onDone now, otherwise an already-drained
// stream would never signal completion
this.onDone()
}
}
/** Cancel the buffer (e.g. room switch). */
cancel() {
if (this.rafId) {
cancelAnimationFrame(this.rafId)
this.rafId = null
}
this.queue = ''
this.finished = false
}
/** @private */
startDrain() {
this.rafId = requestAnimationFrame(() => this.drain())
}
/** @private */
drain() {
if (this.queue.length === 0) {
this.rafId = null
if (this.finished) {
this.onDone()
}
return
}
// Adaptive speed:
// - Base: ~3 chars/frame (smooth typing feel)
// - If buffer > 50 chars: speed up to avoid falling behind
// - If stream ended: flush the remainder faster (~queue/8 chars per frame)
let chars = this.baseSpeed
if (this.queue.length > 200) {
// Very behind — catch up aggressively
chars = Math.ceil(this.queue.length / 10)
} else if (this.queue.length > 50) {
// Moderately behind — speed up proportionally
chars = Math.ceil(this.queue.length / 15)
}
if (this.finished) {
// Stream done, flush remaining smoothly but quickly
chars = Math.max(chars, Math.ceil(this.queue.length / 8))
}
// Note: slice() counts UTF-16 code units and can momentarily split a
// surrogate pair (e.g. an emoji) across frames; the pair is made whole
// again on the next append, so this is acceptable for typical LLM output
const chunk = this.queue.slice(0, chars)
this.queue = this.queue.slice(chars)
this.onText(chunk)
this.rafId = requestAnimationFrame(() => this.drain())
}
}
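A minimal usage sketch for the class above (illustrative only: `ws` and `render` stand in for the app's websocket service and view update, mirroring how the room component drives the buffer):

    let content = ''
    const buf = new StreamBuffer(
      (text) => { content += text; render(content) }, // onText: append each drained slice
      () => render(content)                           // onDone: queue empty after finish()
    )
    ws.on('ai_stream_chunk', (msg) => buf.push(msg.delta)) // bursty network chunks in
    ws.on('ai_stream_end', () => buf.finish())             // flush remainder, then onDone
    // buf.cancel() if the user navigates away mid-stream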

View File

@@ -52,7 +52,9 @@ async fn handle_socket(socket: WebSocket, state: Arc<AppState>, user_id: String,
// Task: forward broadcast events to this client
let mut send_task = tokio::spawn(async move {
while let Ok(event) = broadcast_rx.recv().await {
loop {
match broadcast_rx.recv().await {
Ok(event) => {
let rooms = rooms_clone.lock().await;
if rooms.contains(&event.room_id) {
let msg = serde_json::to_string(&event.message).unwrap();
@@ -61,6 +63,16 @@ async fn handle_socket(socket: WebSocket, state: Arc<AppState>, user_id: String,
}
}
}
Err(tokio::sync::broadcast::error::RecvError::Lagged(n)) => {
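// Lagged(n) means this receiver fell n events behind and the broadcast
// ring buffer overwrote the oldest ones, which can happen during bursts
// of per-token stream-chunk events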
tracing::warn!("WS subscriber lagged, skipped {} messages", n);
// Continue receiving — don't drop the connection
continue;
}
Err(tokio::sync::broadcast::error::RecvError::Closed) => {
break;
}
}
}
});
let state_clone = state.clone();
@@ -224,7 +236,10 @@ async fn handle_send_message(
// Build tools for AI
let tools = openrouter::build_tools();
// Call OpenRouter with tool loop
// Pre-generate AI message ID so we can reference it in stream chunks
let ai_msg_id = Uuid::new_v4().to_string();
// Call OpenRouter with tool loop — uses streaming for all rounds
let mut total_prompt_tokens: u32 = 0;
let mut total_completion_tokens: u32 = 0;
let mut total_response_ms: u64 = 0;
@@ -233,8 +248,8 @@ async fn handle_send_message(
let mut had_error = false;
let mut collected_tool_results: Vec<crate::models::ToolResult> = vec![];
for round in 0..MAX_TOOL_ROUNDS {
let result = openrouter::chat_completion(
'tool_loop: for round in 0..MAX_TOOL_ROUNDS {
let mut stream_rx = openrouter::chat_completion_stream(
chat_history.clone(),
&model_id,
&state.openrouter_key,
@@ -242,17 +257,21 @@ async fn handle_send_message(
)
.await;
match result {
Ok(openrouter::ChatCompletionResult::Response(text, stats)) => {
// Final text response — done!
total_prompt_tokens += stats.prompt_tokens;
total_completion_tokens += stats.completion_tokens;
total_response_ms += stats.response_ms;
final_model = stats.model;
ai_response = text;
break;
while let Some(event) = stream_rx.recv().await {
match event {
openrouter::StreamEvent::Delta(text) => {
// Broadcast each content chunk to clients
let _ = state.tx.send(BroadcastEvent {
room_id: room_id.to_string(),
message: WsServerMessage::AiStreamChunk {
room_id: room_id.to_string(),
message_id: ai_msg_id.clone(),
delta: text.clone(),
},
});
ai_response.push_str(&text);
}
Ok(openrouter::ChatCompletionResult::ToolCalls(assistant_msg, stats)) => {
openrouter::StreamEvent::ToolCalls(assistant_msg, stats) => {
total_prompt_tokens += stats.prompt_tokens;
total_completion_tokens += stats.completion_tokens;
total_response_ms += stats.response_ms;
@@ -270,7 +289,6 @@ async fn handle_send_message(
// Execute each tool call and add results
for tool_call in &tool_calls {
// Extract tool input for display purposes
let tool_input = extract_tool_input(&tool_call.function.name, &tool_call.function.arguments);
// Broadcast real-time tool usage event
@@ -296,14 +314,12 @@ async fn handle_send_message(
tool_result.len()
);
// Collect tool result for inclusion in final message
collected_tool_results.push(crate::models::ToolResult {
tool: tool_call.function.name.clone(),
input: tool_input,
result: tool_result.clone(),
});
// Add tool result to history
chat_history.push(openrouter::ChatMessage {
role: "tool".into(),
content: Some(tool_result),
@@ -311,13 +327,22 @@ async fn handle_send_message(
tool_call_id: Some(tool_call.id.clone()),
});
}
// Loop continues — call OpenRouter again with tool results
// Continue to next round (tool loop)
continue 'tool_loop;
}
Err(e) => {
tracing::error!("OpenRouter error (round {}): {}", round + 1, e);
openrouter::StreamEvent::Done(stats) => {
total_prompt_tokens += stats.prompt_tokens;
total_completion_tokens += stats.completion_tokens;
total_response_ms += stats.response_ms;
final_model = stats.model;
break 'tool_loop;
}
openrouter::StreamEvent::Error(e) => {
tracing::error!("OpenRouter stream error (round {}): {}", round + 1, e);
ai_response = format!("*Sorry, I encountered an error: {}*", e);
had_error = true;
break;
break 'tool_loop;
}
}
}
}
@@ -327,6 +352,15 @@ async fn handle_send_message(
ai_response = "*I used several tools but couldn't formulate a final response. Please try again.*".to_string();
}
// Signal stream end so client can finalize rendering
let _ = state.tx.send(BroadcastEvent {
room_id: room_id.to_string(),
message: WsServerMessage::AiStreamEnd {
room_id: room_id.to_string(),
message_id: ai_msg_id.clone(),
},
});
let ai_meta = if !had_error {
Some(crate::models::AiMeta {
model: final_model,
@@ -345,7 +379,6 @@ async fn handle_send_message(
};
// Store AI response
let ai_msg_id = Uuid::new_v4().to_string();
let ai_now = chrono::Utc::now().to_rfc3339();
// Serialize ai_meta for database storage
@@ -364,7 +397,7 @@ async fn handle_send_message(
.execute(&state.db)
.await;
// Broadcast AI message
// Broadcast final AI message (includes full content + ai_meta)
let ai_payload = MessagePayload {
id: ai_msg_id,
room_id: room_id.to_string(),

View File

@@ -72,7 +72,7 @@ async fn main() {
tracing::info!("Database initialized");
let (tx, _rx) = broadcast::channel::<models::BroadcastEvent>(256);
let (tx, _rx) = broadcast::channel::<models::BroadcastEvent>(4096);
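// (Capacity raised 256 -> 4096: streaming emits one broadcast event per
// content delta, so slow subscribers need far more headroom before they
// hit RecvError::Lagged.)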
let state = Arc::new(AppState {
db,

View File

@@ -164,6 +164,17 @@ pub enum WsServerMessage {
tool_name: String,
status: String,
},
#[serde(rename = "ai_stream_chunk")]
AiStreamChunk {
room_id: String,
message_id: String,
delta: String,
},
#[serde(rename = "ai_stream_end")]
AiStreamEnd {
room_id: String,
message_id: String,
},
}
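Assuming the enum carries an internal type tag (the serde tag attribute sits outside this hunk; the client dispatches on event names like `ai_stream_chunk`), the new events would serialize roughly as:

    {"type": "ai_stream_chunk", "room_id": "r1", "message_id": "m1", "delta": "Hel"}
    {"type": "ai_stream_end", "room_id": "r1", "message_id": "m1"}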
#[derive(Debug, Clone, Serialize, Deserialize)]

View File

@@ -1,3 +1,4 @@
use futures::StreamExt;
use serde::{Deserialize, Serialize};
const OPENROUTER_API_URL: &str = "https://openrouter.ai/api/v1/chat/completions";
@@ -11,6 +12,8 @@ struct ChatRequest {
max_tokens: Option<u32>,
#[serde(skip_serializing_if = "Option::is_none")]
tools: Option<Vec<Tool>>,
#[serde(skip_serializing_if = "Option::is_none")]
stream: Option<bool>,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
@@ -55,29 +58,62 @@ pub struct ToolCallFunction {
// ── Response types ──
#[derive(Debug, Deserialize)]
struct ChatResponse {
choices: Vec<Choice>,
struct Usage {
prompt_tokens: Option<u32>,
completion_tokens: Option<u32>,
total_tokens: Option<u32>,
}
// ── Streaming response types ──
#[derive(Debug, Deserialize)]
struct StreamChunk {
choices: Option<Vec<StreamChoice>>,
model: Option<String>,
usage: Option<Usage>,
}
#[derive(Debug, Deserialize)]
struct Choice {
message: ChoiceMessage,
#[allow(dead_code)]
struct StreamChoice {
delta: Option<StreamDelta>,
finish_reason: Option<String>,
}
#[derive(Debug, Deserialize)]
struct ChoiceMessage {
struct StreamDelta {
content: Option<String>,
#[serde(default)]
tool_calls: Option<Vec<ToolCall>>,
tool_calls: Option<Vec<StreamToolCall>>,
}
#[derive(Debug, Deserialize)]
struct Usage {
prompt_tokens: Option<u32>,
completion_tokens: Option<u32>,
total_tokens: Option<u32>,
#[allow(dead_code)]
struct StreamToolCall {
index: Option<usize>,
id: Option<String>,
#[serde(default)]
r#type: Option<String>,
function: Option<StreamToolCallFunction>,
}
#[derive(Debug, Deserialize)]
struct StreamToolCallFunction {
name: Option<String>,
arguments: Option<String>,
}
/// Events emitted during a streaming completion.
#[derive(Debug)]
pub enum StreamEvent {
/// A chunk of content text.
Delta(String),
/// AI wants to call tools (streaming accumulated the full tool_calls).
ToolCalls(ChatMessage, CompletionStats),
/// Stream finished with final stats.
Done(CompletionStats),
/// An error occurred.
Error(String),
}
/// Stats returned alongside an AI completion.
@@ -90,14 +126,6 @@ pub struct CompletionStats {
pub response_ms: u64,
}
/// Result from a chat completion — either a final text response or tool calls.
pub enum ChatCompletionResult {
/// AI responded with text content.
Response(String, CompletionStats),
/// AI wants to call tools. Contains the assistant message (with tool_calls) and stats.
ToolCalls(ChatMessage, CompletionStats),
}
/// Build the tool definitions for brave_search and web_fetch.
pub fn build_tools() -> Vec<Tool> {
vec![
@@ -142,26 +170,37 @@ pub fn build_tools() -> Vec<Tool> {
]
}
/// Send a chat completion request to OpenRouter.
/// Returns either a text response or tool call requests.
pub async fn chat_completion(
/// Send a streaming chat completion request to OpenRouter.
/// Events are delivered via the returned mpsc receiver:
/// - `StreamEvent::Delta(text)` for each content chunk
/// - `StreamEvent::ToolCalls(msg, stats)` if the AI wants to call tools
/// - `StreamEvent::Done(stats)` when the stream finishes with a text response
/// - `StreamEvent::Error(msg)` on error
pub async fn chat_completion_stream(
history: Vec<ChatMessage>,
model_id: &str,
api_key: &str,
tools: Option<Vec<Tool>>,
) -> Result<ChatCompletionResult, String> {
) -> tokio::sync::mpsc::Receiver<StreamEvent> {
let (tx, rx) = tokio::sync::mpsc::channel::<StreamEvent>(256);
let model_id = model_id.to_string();
let api_key = api_key.to_string();
tokio::spawn(async move {
let client = reqwest::Client::new();
let request_body = ChatRequest {
model: model_id.to_string(),
model: model_id.clone(),
messages: history,
max_tokens: Some(2048),
tools,
stream: Some(true),
};
let start = std::time::Instant::now();
let response = client
let response = match client
.post(OPENROUTER_API_URL)
.header("Authorization", format!("Bearer {}", api_key))
.header("Content-Type", "application/json")
@@ -170,57 +209,159 @@ pub async fn chat_completion(
.json(&request_body)
.send()
.await
.map_err(|e| format!("OpenRouter request failed: {}", e))?;
let elapsed_ms = start.elapsed().as_millis() as u64;
{
Ok(r) => r,
Err(e) => {
let _ = tx.send(StreamEvent::Error(format!("Request failed: {}", e))).await;
return;
}
};
if !response.status().is_success() {
let status = response.status();
let body = response.text().await.unwrap_or_default();
return Err(format!("OpenRouter error {}: {}", status, body));
let _ = tx.send(StreamEvent::Error(format!("OpenRouter error {}: {}", status, body))).await;
return;
}
let chat_response: ChatResponse = response
.json()
.await
.map_err(|e| format!("Failed to parse OpenRouter response: {}", e))?;
// Read SSE stream
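// The body arrives as Server-Sent Events, one JSON chunk per data line,
// e.g. (illustrative):
//   data: {"choices":[{"delta":{"content":"Hel"}}]}
//   data: {"choices":[{"delta":{"content":"lo"}}]}
//   data: [DONE]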
let mut byte_stream = response.bytes_stream();
// Raw byte buffer: a UTF-8 character can be split across network chunks,
// so decode only after a full line is available
let mut byte_buf: Vec<u8> = Vec::new();
let mut full_content = String::new();
let mut model_name = model_id.clone();
let choice = chat_response
.choices
.first()
.ok_or_else(|| "No response from OpenRouter".to_string())?;
// Accumulators for streamed tool calls
let mut tool_call_accum: Vec<ToolCall> = Vec::new();
let mut has_tool_calls = false;
let usage = chat_response.usage.unwrap_or(Usage {
prompt_tokens: None,
completion_tokens: None,
total_tokens: None,
// Usage from the final chunk (some providers include it)
let mut final_usage: Option<Usage> = None;
while let Some(chunk_result) = byte_stream.next().await {
let bytes = match chunk_result {
Ok(b) => b,
Err(e) => {
let _ = tx.send(StreamEvent::Error(format!("Stream error: {}", e))).await;
return;
}
};
byte_buf.extend_from_slice(&bytes);
// Process complete SSE lines; decoding whole lines only means a
// multi-byte character split across chunks is never mangled
while let Some(line_end) = byte_buf.iter().position(|&b| b == b'\n') {
let line_bytes: Vec<u8> = byte_buf.drain(..=line_end).collect();
let line = String::from_utf8_lossy(&line_bytes).trim().to_string();
if line.is_empty() || line.starts_with(':') {
continue;
}
if let Some(data) = line.strip_prefix("data: ") {
let data = data.trim();
if data == "[DONE]" {
// Stream finished
continue;
}
let chunk: StreamChunk = match serde_json::from_str(data) {
Ok(c) => c,
Err(_) => continue,
};
if let Some(m) = &chunk.model {
model_name = m.clone();
}
if let Some(u) = chunk.usage {
final_usage = Some(u);
}
if let Some(choices) = &chunk.choices {
if let Some(choice) = choices.first() {
if let Some(delta) = &choice.delta {
// Handle content delta
if let Some(content) = &delta.content {
if !content.is_empty() {
full_content.push_str(content);
let _ = tx.send(StreamEvent::Delta(content.clone())).await;
}
}
// Handle tool call deltas (accumulate incrementally)
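// Providers stream tool calls fragmented: `id` and `name` arrive once,
// while `arguments` trickles in as JSON string pieces (e.g. `{"qu`, then
// `ery": "rust`, then ` sse"}`), keyed by `index` for parallel calls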
if let Some(tcs) = &delta.tool_calls {
has_tool_calls = true;
for tc in tcs {
let idx = tc.index.unwrap_or(0);
// Ensure we have enough slots
while tool_call_accum.len() <= idx {
tool_call_accum.push(ToolCall {
id: String::new(),
r#type: "function".to_string(),
function: ToolCallFunction {
name: String::new(),
arguments: String::new(),
},
});
}
if let Some(id) = &tc.id {
tool_call_accum[idx].id = id.clone();
}
if let Some(func) = &tc.function {
if let Some(name) = &func.name {
tool_call_accum[idx].function.name.push_str(name);
}
if let Some(args) = &func.arguments {
tool_call_accum[idx].function.arguments.push_str(args);
}
}
}
}
}
}
}
}
}
}
let elapsed_ms = start.elapsed().as_millis() as u64;
let (prompt_tokens, completion_tokens, total_tokens) = match &final_usage {
Some(u) => (
u.prompt_tokens.unwrap_or(0),
u.completion_tokens.unwrap_or(0),
u.total_tokens.unwrap_or(0),
),
None => (0, 0, 0),
};
let stats = CompletionStats {
model: chat_response.model.unwrap_or_else(|| model_id.to_string()),
prompt_tokens: usage.prompt_tokens.unwrap_or(0),
completion_tokens: usage.completion_tokens.unwrap_or(0),
total_tokens: usage.total_tokens.unwrap_or(0),
model: model_name,
prompt_tokens,
completion_tokens,
total_tokens,
response_ms: elapsed_ms,
};
// Check if the AI wants to call tools
if let Some(tool_calls) = &choice.message.tool_calls {
if !tool_calls.is_empty() {
// Return the assistant message with tool calls so it can be added to history
if has_tool_calls && !tool_call_accum.is_empty() {
// AI requested tool calls
let assistant_msg = ChatMessage {
role: "assistant".into(),
content: choice.message.content.clone(),
tool_calls: Some(tool_calls.clone()),
content: if full_content.is_empty() { None } else { Some(full_content) },
tool_calls: Some(tool_call_accum),
tool_call_id: None,
};
return Ok(ChatCompletionResult::ToolCalls(assistant_msg, stats));
}
let _ = tx.send(StreamEvent::ToolCalls(assistant_msg, stats)).await;
} else {
// Normal text response completed
let _ = tx.send(StreamEvent::Done(stats)).await;
}
});
// Regular text response
let content = choice.message.content.clone().unwrap_or_default();
Ok(ChatCompletionResult::Response(content, stats))
rx
}
/// Build the message history for OpenRouter from stored messages.