StringDecoder buffers partial byte sequences across + * chunks so decoded strings round-trip the original bytes.
+ buffer += stdoutDecoder.end(); + stderrBuffer += stderrDecoder.end(); if (buffer.trim()) { try { handleStreamEvent(JSON.parse(buffer), tid); } catch (err: any) { console.error(`[sidebar-agent] Tab ${tid}: Failed to parse final buffer:`, buffer.slice(0, 100), err.message); diff --git a/browse/src/utf8-stream-decoder.ts b/browse/src/utf8-stream-decoder.ts new file mode 100644 index 000000000..723deb96e --- /dev/null +++ b/browse/src/utf8-stream-decoder.ts @@ -0,0 +1,29 @@ +/** + * UTF-8 stream decoder for child-process stdout/stderr. + * + * Per-chunk `Buffer.toString('utf8')` corrupts multi-byte characters + * (Korean, Japanese, Chinese, emoji, etc.) when a code-point straddles a + * chunk boundary — the partial bytes decode to U+FFFD and the next chunk + * starts mid-sequence. StringDecoder buffers partial code units across + * chunks so decoded strings round-trip the original bytes. + * + * Pure ASCII streams are unaffected: every byte is a complete code point. + * + * This is a thin wrapper around the built-in StringDecoder rather than a + * direct usage at the call site so the contract (write/end) can be + * unit-tested in isolation — see browse/test/utf8-stream-decoder.test.ts, + * which splits multi-byte text at every byte offset to pin the behavior. + */ + +import { StringDecoder } from 'string_decoder'; + +export function createUtf8StreamDecoder(): { + write(chunk: Buffer): string; + end(): string; +} { + const decoder = new StringDecoder('utf8'); + return { + write: (chunk) => decoder.write(chunk), + end: () => decoder.end(), + }; +} diff --git a/browse/test/utf8-stream-decoder.test.ts b/browse/test/utf8-stream-decoder.test.ts new file mode 100644 index 000000000..38f76be14 --- /dev/null +++ b/browse/test/utf8-stream-decoder.test.ts @@ -0,0 +1,88 @@ +/** + * Regression tests for UTF-8 stream chunk-boundary handling. + * + * Background: sidebar-agent previously used per-chunk `Buffer.toString()` on + * `proc.stdout` data events. 
Each Hangul syllable is 3 bytes in UTF-8, so every non-zero offset + // inside a syllable is a boundary that breaks naive per-chunk .toString() and that the fix must survive.
Buffer.from(text, 'utf8'); + const decoder = createUtf8StreamDecoder(); + let out = ''; + for (let i = 0; i < buf.length; i++) { + out += decoder.write(buf.subarray(i, i + 1)); + } + out += decoder.end(); + expect(out).toBe(text); + }); + + test('ASCII-only input is unaffected', () => { + const text = '{"type":"text","content":"hello world"}\n'; + const buf = Buffer.from(text, 'utf8'); + for (let i = 0; i <= buf.length; i++) { + expect(feedSplit(buf, i)).toBe(text); + } + }); + + test('end() flushes trailing partial sequence as replacement char', () => { + // If the stream truncates mid-character (process killed), end() should + // not hold bytes indefinitely. StringDecoder emits U+FFFD for the + // incomplete tail — acceptable: we prefer a visible marker over lost data. + const decoder = createUtf8StreamDecoder(); + const partial = Buffer.from('안', 'utf8').subarray(0, 2); // 2 of 3 bytes + const mid = decoder.write(partial); + const flushed = decoder.end(); + expect(mid + flushed).toContain('\uFFFD'); + }); +});