From 8eebfe2646a73d42e224824de7877e8ec9f3479c Mon Sep 17 00:00:00 2001 From: chappse6 Date: Wed, 15 Apr 2026 13:18:39 +0900 Subject: [PATCH 1/2] fix: preserve multi-byte UTF-8 across sidebar-agent stdout chunks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Korean, Japanese, Chinese, and emoji characters streamed from claude via sidebar-agent were mojibake'd when a multi-byte UTF-8 code point landed on a Buffer chunk boundary — e.g. Korean "합니다" rendered as "핣니다" in the sidebar. Per-chunk Buffer.toString('utf8') replaces partial sequences with U+FFFD and the next chunk starts mid-sequence, corrupting both chunks. ASCII-only streams are unaffected (1 byte = 1 code point). Route proc.stdout and proc.stderr through a small StringDecoder wrapper that buffers partial code units across chunks, and flush on close. Adds a unit test that splits Korean, Japanese, Chinese, 4-byte emoji, and mixed text at every possible byte offset and verifies the decoded string round-trips the original. --- browse/src/sidebar-agent.ts | 9 ++- browse/src/utf8-stream-decoder.ts | 24 +++++++ browse/test/utf8-stream-decoder.test.ts | 88 +++++++++++++++++++++++++ 3 files changed, 119 insertions(+), 2 deletions(-) create mode 100644 browse/src/utf8-stream-decoder.ts create mode 100644 browse/test/utf8-stream-decoder.test.ts diff --git a/browse/src/sidebar-agent.ts b/browse/src/sidebar-agent.ts index 215c717b40..e6da51cedd 100644 --- a/browse/src/sidebar-agent.ts +++ b/browse/src/sidebar-agent.ts @@ -13,6 +13,7 @@ import { spawn } from 'child_process'; import * as fs from 'fs'; import * as path from 'path'; import { safeUnlink } from './error-handling'; +import { createUtf8StreamDecoder } from './utf8-stream-decoder'; const QUEUE = process.env.SIDEBAR_QUEUE_PATH || path.join(process.env.HOME || '/tmp', '.gstack', 'sidebar-agent-queue.jsonl'); const KILL_FILE = path.join(path.dirname(QUEUE), 'sidebar-agent-kill'); @@ -331,9 +332,11 @@ async function askClaude(queueEntry: QueueEntry): Promise { }, 500); let buffer = ''; + const stdoutDecoder = createUtf8StreamDecoder(); + const stderrDecoder = createUtf8StreamDecoder(); proc.stdout.on('data', (data: Buffer) => { - buffer += data.toString(); + buffer += stdoutDecoder.write(data); const lines = buffer.split('\n'); buffer = lines.pop() || ''; for (const line of lines) { @@ -346,13 +349,15 @@ async function askClaude(queueEntry: QueueEntry): Promise { let stderrBuffer = ''; proc.stderr.on('data', (data: Buffer) => { - stderrBuffer += data.toString(); + stderrBuffer += stderrDecoder.write(data); }); proc.on('close', (code) => { clearInterval(cancelCheck); activeProc = null; activeProcs.delete(tid); + buffer += stdoutDecoder.end(); + stderrBuffer += stderrDecoder.end(); if (buffer.trim()) { try { handleStreamEvent(JSON.parse(buffer), tid); } catch (err: any) { console.error(`[sidebar-agent] Tab ${tid}: Failed to parse final buffer:`, buffer.slice(0, 100), err.message); diff --git a/browse/src/utf8-stream-decoder.ts b/browse/src/utf8-stream-decoder.ts new file mode 100644 index 0000000000..4511d4d173 --- /dev/null +++ b/browse/src/utf8-stream-decoder.ts @@ -0,0 +1,24 @@ +/** + * UTF-8 stream decoder for child-process stdout/stderr. + * + * Per-chunk `Buffer.toString('utf8')` corrupts multi-byte characters + * (Korean, Japanese, Chinese, emoji, etc.) when a code-point straddles a + * chunk boundary — the partial bytes decode to U+FFFD and the next chunk + * starts mid-sequence. StringDecoder buffers partial code units across + * chunks so decoded strings round-trip the original bytes. + * + * Pure ASCII streams are unaffected: every byte is a complete code point. + */ + +import { StringDecoder } from 'string_decoder'; + +export function createUtf8StreamDecoder(): { + write(chunk: Buffer): string; + end(): string; +} { + const decoder = new StringDecoder('utf8'); + return { + write: (chunk) => decoder.write(chunk), + end: () => decoder.end(), + }; +} diff --git a/browse/test/utf8-stream-decoder.test.ts b/browse/test/utf8-stream-decoder.test.ts new file mode 100644 index 0000000000..38f76be142 --- /dev/null +++ b/browse/test/utf8-stream-decoder.test.ts @@ -0,0 +1,88 @@ +/** + * Regression tests for UTF-8 stream chunk-boundary handling. + * + * Background: sidebar-agent previously used per-chunk `Buffer.toString()` on + * `proc.stdout` data events. When claude emitted non-ASCII text (Korean, + * Japanese, Chinese, emoji), a multi-byte character that happened to land on + * a chunk boundary would be mojibake'd — e.g. Korean "합니다" rendered as + * "핣니다" in the sidebar. These tests pin the fix: chunks split at every + * byte offset must reassemble to the original string. + */ + +import { describe, test, expect } from 'bun:test'; +import { createUtf8StreamDecoder } from '../src/utf8-stream-decoder'; + +function feedSplit(input: Buffer, splitAt: number): string { + const decoder = createUtf8StreamDecoder(); + let out = decoder.write(input.subarray(0, splitAt)); + out += decoder.write(input.subarray(splitAt)); + out += decoder.end(); + return out; +} + +describe('createUtf8StreamDecoder', () => { + test('preserves Korean text split at every byte boundary', () => { + // Each Hangul syllable is 3 bytes in UTF-8, so every non-zero offset + // inside a syllable is a boundary the naive .toString() fix must survive. + const text = '안녕하세요 합니다'; + const buf = Buffer.from(text, 'utf8'); + for (let i = 0; i <= buf.length; i++) { + expect(feedSplit(buf, i)).toBe(text); + } + }); + + test('preserves 4-byte emoji split at every byte boundary', () => { + const text = 'hi 👋 there 🎉 done'; + const buf = Buffer.from(text, 'utf8'); + for (let i = 0; i <= buf.length; i++) { + expect(feedSplit(buf, i)).toBe(text); + } + }); + + test('preserves Japanese and Chinese mix across chunks', () => { + const text = 'こんにちは 你好 世界'; + const buf = Buffer.from(text, 'utf8'); + for (let i = 0; i <= buf.length; i++) { + expect(feedSplit(buf, i)).toBe(text); + } + }); + + test('never emits the U+FFFD replacement char for valid UTF-8', () => { + const text = '핣'.repeat(100); // dense Korean + const buf = Buffer.from(text, 'utf8'); + for (let i = 0; i <= buf.length; i++) { + expect(feedSplit(buf, i)).not.toContain('\uFFFD'); + } + }); + + test('handles many tiny chunks (one byte at a time)', () => { + const text = 'stream 안녕 🎉 end'; + const buf = Buffer.from(text, 'utf8'); + const decoder = createUtf8StreamDecoder(); + let out = ''; + for (let i = 0; i < buf.length; i++) { + out += decoder.write(buf.subarray(i, i + 1)); + } + out += decoder.end(); + expect(out).toBe(text); + }); + + test('ASCII-only input is unaffected', () => { + const text = '{"type":"text","content":"hello world"}\n'; + const buf = Buffer.from(text, 'utf8'); + for (let i = 0; i <= buf.length; i++) { + expect(feedSplit(buf, i)).toBe(text); + } + }); + + test('end() flushes trailing partial sequence as replacement char', () => { + // If the stream truncates mid-character (process killed), end() should + // not hold bytes indefinitely. StringDecoder emits U+FFFD for the + // incomplete tail — acceptable: we prefer a visible marker over lost data. + const decoder = createUtf8StreamDecoder(); + const partial = Buffer.from('안', 'utf8').subarray(0, 2); // 2 of 3 bytes + const mid = decoder.write(partial); + const flushed = decoder.end(); + expect(mid + flushed).toContain('\uFFFD'); + }); +}); From 869df66a6456ebcfe3d34e60fb0687af1af76f11 Mon Sep 17 00:00:00 2001 From: chappse6 Date: Wed, 15 Apr 2026 13:40:48 +0900 Subject: [PATCH 2/2] docs: clarify rationale for decoder wrapper and close-flush Address review feedback: - Extend utf8-stream-decoder.ts JSDoc to explain why the thin wrapper exists (unit-testable contract vs. inline StringDecoder usage). - Annotate the `end()` flush on process close so the intent is obvious without grepping StringDecoder's docs. No behavior change. --- browse/src/sidebar-agent.ts | 4 ++++ browse/src/utf8-stream-decoder.ts | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/browse/src/sidebar-agent.ts b/browse/src/sidebar-agent.ts index e6da51cedd..4b8ed063e4 100644 --- a/browse/src/sidebar-agent.ts +++ b/browse/src/sidebar-agent.ts @@ -356,6 +356,10 @@ async function askClaude(queueEntry: QueueEntry): Promise { clearInterval(cancelCheck); activeProc = null; activeProcs.delete(tid); + // Flush any bytes held back by the UTF-8 decoder (e.g. a trailing + // incomplete multi-byte sequence at end-of-stream). Normally empty; + // if the child exited mid-codepoint StringDecoder emits U+FFFD here + // rather than silently dropping the bytes. buffer += stdoutDecoder.end(); stderrBuffer += stderrDecoder.end(); if (buffer.trim()) { diff --git a/browse/src/utf8-stream-decoder.ts b/browse/src/utf8-stream-decoder.ts index 4511d4d173..723deb96e0 100644 --- a/browse/src/utf8-stream-decoder.ts +++ b/browse/src/utf8-stream-decoder.ts @@ -8,6 +8,11 @@ * chunks so decoded strings round-trip the original bytes. * * Pure ASCII streams are unaffected: every byte is a complete code point. + * + * This is a thin wrapper around the built-in StringDecoder rather than a + * direct usage at the call site so the contract (write/end) can be + * unit-tested in isolation — see browse/test/utf8-stream-decoder.test.ts, + * which splits multi-byte text at every byte offset to pin the behavior. */ import { StringDecoder } from 'string_decoder';