garrytan · chappse6 · Apr 15, 2026 · Apr 15, 2026
diff --git a/browse/src/sidebar-agent.ts b/browse/src/sidebar-agent.ts
@@ -13,6 +13,7 @@ import { spawn } from 'child_process';
 import * as fs from 'fs';
 import * as path from 'path';
 import { safeUnlink } from './error-handling';
+import { createUtf8StreamDecoder } from './utf8-stream-decoder';
 
 const QUEUE = process.env.SIDEBAR_QUEUE_PATH || path.join(process.env.HOME || '/tmp', '.gstack', 'sidebar-agent-queue.jsonl');
 const KILL_FILE = path.join(path.dirname(QUEUE), 'sidebar-agent-kill');
@@ -331,9 +332,11 @@ async function askClaude(queueEntry: QueueEntry): Promise<void> {
     }, 500);
 
     let buffer = '';
+    const stdoutDecoder = createUtf8StreamDecoder();
+    const stderrDecoder = createUtf8StreamDecoder();
 
     proc.stdout.on('data', (data: Buffer) => {
-      buffer += data.toString();
+      buffer += stdoutDecoder.write(data);
       const lines = buffer.split('\n');
       buffer = lines.pop() || '';
       for (const line of lines) {
@@ -346,13 +349,19 @@ async function askClaude(queueEntry: QueueEntry): Promise<void> {
 
     let stderrBuffer = '';
     proc.stderr.on('data', (data: Buffer) => {
-      stderrBuffer += data.toString();
+      stderrBuffer += stderrDecoder.write(data);
     });
 
     proc.on('close', (code) => {
       clearInterval(cancelCheck);
       activeProc = null;
       activeProcs.delete(tid);
+      // Flush any bytes held back by the UTF-8 decoder (e.g. a trailing
+      // incomplete multi-byte sequence at end-of-stream). Normally empty;
+      // if the child exited mid-codepoint StringDecoder emits U+FFFD here
+      // rather than silently dropping the bytes.
+      buffer += stdoutDecoder.end();
+      stderrBuffer += stderrDecoder.end();
       if (buffer.trim()) {
         try { handleStreamEvent(JSON.parse(buffer), tid); } catch (err: any) {
           console.error(`[sidebar-agent] Tab ${tid}: Failed to parse final buffer:`, buffer.slice(0, 100), err.message);

diff --git a/browse/src/utf8-stream-decoder.ts b/browse/src/utf8-stream-decoder.ts
@@ -0,0 +1,29 @@
+/**
+ * UTF-8 stream decoder for child-process stdout/stderr.
+ *
+ * Per-chunk `Buffer.toString('utf8')` corrupts multi-byte characters
+ * (Korean, Japanese, Chinese, emoji, etc.) when a code-point straddles a
+ * chunk boundary — the partial bytes decode to U+FFFD and the next chunk
+ * starts mid-sequence. StringDecoder buffers partial code units across
+ * chunks so decoded strings round-trip the original bytes.
+ *
+ * Pure ASCII streams are unaffected: every byte is a complete code point.
+ *
+ * This is a thin wrapper around the built-in StringDecoder rather than a
+ * direct usage at the call site so the contract (write/end) can be
+ * unit-tested in isolation — see browse/test/utf8-stream-decoder.test.ts,
+ * which splits multi-byte text at every byte offset to pin the behavior.
+ */
+
+import { StringDecoder } from 'string_decoder';
+
+export function createUtf8StreamDecoder(): {
+  write(chunk: Buffer): string;
+  end(): string;
+} {
+  const decoder = new StringDecoder('utf8');
+  return {
+    write: (chunk) => decoder.write(chunk),
+    end: () => decoder.end(),
+  };
+}
diff --git a/browse/test/utf8-stream-decoder.test.ts b/browse/test/utf8-stream-decoder.test.ts
@@ -0,0 +1,88 @@
+/**
+ * Regression tests for UTF-8 stream chunk-boundary handling.
+ *
+ * Background: sidebar-agent previously used per-chunk `Buffer.toString()` on
+ * `proc.stdout` data events. When claude emitted non-ASCII text (Korean,
+ * Japanese, Chinese, emoji), a multi-byte character that happened to land on
+ * a chunk boundary would be mojibake'd — e.g. Korean "합니다" rendered as
+ * "핣니다" in the sidebar. These tests pin the fix: chunks split at every
+ * byte offset must reassemble to the original string.
+ */
+
+import { describe, test, expect } from 'bun:test';
+import { createUtf8StreamDecoder } from '../src/utf8-stream-decoder';
+
+function feedSplit(input: Buffer, splitAt: number): string {
+  const decoder = createUtf8StreamDecoder();
+  let out = decoder.write(input.subarray(0, splitAt));
+  out += decoder.write(input.subarray(splitAt));
+  out += decoder.end();
+  return out;
+}
+
+describe('createUtf8StreamDecoder', () => {
+  test('preserves Korean text split at every byte boundary', () => {
+    // Each Hangul syllable is 3 bytes in UTF-8, so every non-zero offset
+    // inside a syllable is a boundary the naive .toString() fix must survive.
+    const text = '안녕하세요 합니다';
+    const buf = Buffer.from(text, 'utf8');
+    for (let i = 0; i <= buf.length; i++) {
+      expect(feedSplit(buf, i)).toBe(text);
+    }
+  });
+
+  test('preserves 4-byte emoji split at every byte boundary', () => {
+    const text = 'hi 👋 there 🎉 done';
+    const buf = Buffer.from(text, 'utf8');
+    for (let i = 0; i <= buf.length; i++) {
+      expect(feedSplit(buf, i)).toBe(text);
+    }
+  });
+
+  test('preserves Japanese and Chinese mix across chunks', () => {
+    const text = 'こんにちは 你好 世界';
+    const buf = Buffer.from(text, 'utf8');
+    for (let i = 0; i <= buf.length; i++) {
+      expect(feedSplit(buf, i)).toBe(text);
+    }
+  });
+
+  test('never emits the U+FFFD replacement char for valid UTF-8', () => {
+    const text = '핣'.repeat(100); // dense Korean
+    const buf = Buffer.from(text, 'utf8');
+    for (let i = 0; i <= buf.length; i++) {
+      expect(feedSplit(buf, i)).not.toContain('\uFFFD');
+    }
+  });
+
+  test('handles many tiny chunks (one byte at a time)', () => {
+    const text = 'stream 안녕 🎉 end';
+    const buf = Buffer.from(text, 'utf8');
+    const decoder = createUtf8StreamDecoder();
+    let out = '';
+    for (let i = 0; i < buf.length; i++) {
+      out += decoder.write(buf.subarray(i, i + 1));
+    }
+    out += decoder.end();
+    expect(out).toBe(text);
+  });
+
+  test('ASCII-only input is unaffected', () => {
+    const text = '{"type":"text","content":"hello world"}\n';
+    const buf = Buffer.from(text, 'utf8');
+    for (let i = 0; i <= buf.length; i++) {
+      expect(feedSplit(buf, i)).toBe(text);
+    }
+  });
+
+  test('end() flushes trailing partial sequence as replacement char', () => {
+    // If the stream truncates mid-character (process killed), end() should
+    // not hold bytes indefinitely. StringDecoder emits U+FFFD for the
+    // incomplete tail — acceptable: we prefer a visible marker over lost data.
+    const decoder = createUtf8StreamDecoder();
+    const partial = Buffer.from('안', 'utf8').subarray(0, 2); // 2 of 3 bytes
+    const mid = decoder.write(partial);
+    const flushed = decoder.end();
+    expect(mid + flushed).toContain('\uFFFD');
+  });
+});