From 8eebfe2646a73d42e224824de7877e8ec9f3479c Mon Sep 17 00:00:00 2001
From: chappse6 <chappse6@gmail.com>
Date: Wed, 15 Apr 2026 13:18:39 +0900
Subject: [PATCH 1/2] fix: preserve multi-byte UTF-8 across sidebar-agent
 stdout chunks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Korean, Japanese, Chinese, and emoji characters streamed from claude via
sidebar-agent were mojibake'd when a multi-byte UTF-8 code point landed
on a Buffer chunk boundary — e.g. Korean "합니다" rendered as "핣니다"
in the sidebar. Per-chunk Buffer.toString('utf8') replaces partial
sequences with U+FFFD and the next chunk starts mid-sequence, corrupting
both chunks. ASCII-only streams are unaffected (1 byte = 1 code point).

Route proc.stdout and proc.stderr through a small StringDecoder wrapper
that buffers partial code units across chunks, and flush on close.

Adds a unit test that splits Korean, Japanese, Chinese, 4-byte emoji,
and mixed text at every possible byte offset and verifies the decoded
string round-trips the original.
---
 browse/src/sidebar-agent.ts             |  9 ++-
 browse/src/utf8-stream-decoder.ts       | 24 +++++++
 browse/test/utf8-stream-decoder.test.ts | 88 +++++++++++++++++++++++++
 3 files changed, 119 insertions(+), 2 deletions(-)
 create mode 100644 browse/src/utf8-stream-decoder.ts
 create mode 100644 browse/test/utf8-stream-decoder.test.ts
diff --git a/browse/src/sidebar-agent.ts b/browse/src/sidebar-agent.ts
index 215c717b40..e6da51cedd 100644
--- a/browse/src/sidebar-agent.ts
+++ b/browse/src/sidebar-agent.ts
@@ -13,6 +13,7 @@ import { spawn } from 'child_process';
 import * as fs from 'fs';
 import * as path from 'path';
 import { safeUnlink } from './error-handling';
+import { createUtf8StreamDecoder } from './utf8-stream-decoder';
 
 const QUEUE = process.env.SIDEBAR_QUEUE_PATH || path.join(process.env.HOME || '/tmp', '.gstack', 'sidebar-agent-queue.jsonl');
 const KILL_FILE = path.join(path.dirname(QUEUE), 'sidebar-agent-kill');
@@ -331,9 +332,11 @@ async function askClaude(queueEntry: QueueEntry): Promise<void> {
     }, 500);
 
     let buffer = '';
+    const stdoutDecoder = createUtf8StreamDecoder();
+    const stderrDecoder = createUtf8StreamDecoder();
 
     proc.stdout.on('data', (data: Buffer) => {
-      buffer += data.toString();
+      buffer += stdoutDecoder.write(data);
       const lines = buffer.split('\n');
       buffer = lines.pop() || '';
       for (const line of lines) {
@@ -346,13 +349,15 @@ async function askClaude(queueEntry: QueueEntry): Promise<void> {
 
     let stderrBuffer = '';
     proc.stderr.on('data', (data: Buffer) => {
-      stderrBuffer += data.toString();
+      stderrBuffer += stderrDecoder.write(data);
     });
 
     proc.on('close', (code) => {
       clearInterval(cancelCheck);
       activeProc = null;
       activeProcs.delete(tid);
+      buffer += stdoutDecoder.end();
+      stderrBuffer += stderrDecoder.end();
       if (buffer.trim()) {
         try { handleStreamEvent(JSON.parse(buffer), tid); } catch (err: any) {
           console.error(`[sidebar-agent] Tab ${tid}: Failed to parse final buffer:`, buffer.slice(0, 100), err.message);
diff --git a/browse/src/utf8-stream-decoder.ts b/browse/src/utf8-stream-decoder.ts
new file mode 100644
index 0000000000..4511d4d173
--- /dev/null
+++ b/browse/src/utf8-stream-decoder.ts
@@ -0,0 +1,24 @@
+/**
+ * UTF-8 stream decoder for child-process stdout/stderr.
+ *
+ * Per-chunk `Buffer.toString('utf8')` corrupts multi-byte characters
+ * (Korean, Japanese, Chinese, emoji, etc.) when a code-point straddles a
+ * chunk boundary — the partial bytes decode to U+FFFD and the next chunk
+ * starts mid-sequence. StringDecoder buffers partial code units across
+ * chunks so decoded strings round-trip the original bytes.
+ *
+ * Pure ASCII streams are unaffected: every byte is a complete code point.
+ */
+
+import { StringDecoder } from 'string_decoder';
+
+export function createUtf8StreamDecoder(): {
+  write(chunk: Buffer): string;
+  end(): string;
+} {
+  const decoder = new StringDecoder('utf8');
+  return {
+    write: (chunk) => decoder.write(chunk),
+    end: () => decoder.end(),
+  };
+}
diff --git a/browse/test/utf8-stream-decoder.test.ts b/browse/test/utf8-stream-decoder.test.ts
new file mode 100644
index 0000000000..38f76be142
--- /dev/null
+++ b/browse/test/utf8-stream-decoder.test.ts
@@ -0,0 +1,88 @@
+/**
+ * Regression tests for UTF-8 stream chunk-boundary handling.
+ *
+ * Background: sidebar-agent previously used per-chunk `Buffer.toString()` on
+ * `proc.stdout` data events. When claude emitted non-ASCII text (Korean,
+ * Japanese, Chinese, emoji), a multi-byte character that happened to land on
+ * a chunk boundary would be mojibake'd — e.g. Korean "합니다" rendered as
+ * "핣니다" in the sidebar. These tests pin the fix: chunks split at every
+ * byte offset must reassemble to the original string.
+ */
+
+import { describe, test, expect } from 'bun:test';
+import { createUtf8StreamDecoder } from '../src/utf8-stream-decoder';
+
+function feedSplit(input: Buffer, splitAt: number): string {
+  const decoder = createUtf8StreamDecoder();
+  let out = decoder.write(input.subarray(0, splitAt));
+  out += decoder.write(input.subarray(splitAt));
+  out += decoder.end();
+  return out;
+}
+
+describe('createUtf8StreamDecoder', () => {
+  test('preserves Korean text split at every byte boundary', () => {
+    // Each Hangul syllable is 3 bytes in UTF-8, so every non-zero offset
+    // inside a syllable is a boundary the naive .toString() fix must survive.
+    const text = '안녕하세요 합니다';
+    const buf = Buffer.from(text, 'utf8');
+    for (let i = 0; i <= buf.length; i++) {
+      expect(feedSplit(buf, i)).toBe(text);
+    }
+  });
+
+  test('preserves 4-byte emoji split at every byte boundary', () => {
+    const text = 'hi 👋 there 🎉 done';
+    const buf = Buffer.from(text, 'utf8');
+    for (let i = 0; i <= buf.length; i++) {
+      expect(feedSplit(buf, i)).toBe(text);
+    }
+  });
+
+  test('preserves Japanese and Chinese mix across chunks', () => {
+    const text = 'こんにちは 你好 世界';
+    const buf = Buffer.from(text, 'utf8');
+    for (let i = 0; i <= buf.length; i++) {
+      expect(feedSplit(buf, i)).toBe(text);
+    }
+  });
+
+  test('never emits the U+FFFD replacement char for valid UTF-8', () => {
+    const text = '핣'.repeat(100); // dense Korean
+    const buf = Buffer.from(text, 'utf8');
+    for (let i = 0; i <= buf.length; i++) {
+      expect(feedSplit(buf, i)).not.toContain('\uFFFD');
+    }
+  });
+
+  test('handles many tiny chunks (one byte at a time)', () => {
+    const text = 'stream 안녕 🎉 end';
+    const buf = Buffer.from(text, 'utf8');
+    const decoder = createUtf8StreamDecoder();
+    let out = '';
+    for (let i = 0; i < buf.length; i++) {
+      out += decoder.write(buf.subarray(i, i + 1));
+    }
+    out += decoder.end();
+    expect(out).toBe(text);
+  });
+
+  test('ASCII-only input is unaffected', () => {
+    const text = '{"type":"text","content":"hello world"}\n';
+    const buf = Buffer.from(text, 'utf8');
+    for (let i = 0; i <= buf.length; i++) {
+      expect(feedSplit(buf, i)).toBe(text);
+    }
+  });
+
+  test('end() flushes trailing partial sequence as replacement char', () => {
+    // If the stream truncates mid-character (process killed), end() should
+    // not hold bytes indefinitely. StringDecoder emits U+FFFD for the
+    // incomplete tail — acceptable: we prefer a visible marker over lost data.
+    const decoder = createUtf8StreamDecoder();
+    const partial = Buffer.from('안', 'utf8').subarray(0, 2); // 2 of 3 bytes
+    const mid = decoder.write(partial);
+    const flushed = decoder.end();
+    expect(mid + flushed).toContain('\uFFFD');
+  });
+});

From 869df66a6456ebcfe3d34e60fb0687af1af76f11 Mon Sep 17 00:00:00 2001
From: chappse6 <chappse6@gmail.com>
Date: Wed, 15 Apr 2026 13:40:48 +0900
Subject: [PATCH 2/2] docs: clarify rationale for decoder wrapper and
 close-flush

Address review feedback:
- Extend utf8-stream-decoder.ts JSDoc to explain why the thin wrapper
  exists (unit-testable contract vs. inline StringDecoder usage).
- Annotate the `end()` flush on process close so the intent is obvious
  without grepping StringDecoder's docs.

No behavior change.
---
 browse/src/sidebar-agent.ts       | 4 ++++
 browse/src/utf8-stream-decoder.ts | 5 +++++
 2 files changed, 9 insertions(+)

diff --git a/browse/src/sidebar-agent.ts b/browse/src/sidebar-agent.ts
index e6da51cedd..4b8ed063e4 100644
--- a/browse/src/sidebar-agent.ts
+++ b/browse/src/sidebar-agent.ts
@@ -356,6 +356,10 @@ async function askClaude(queueEntry: QueueEntry): Promise<void> {
       clearInterval(cancelCheck);
       activeProc = null;
       activeProcs.delete(tid);
+      // Flush any bytes held back by the UTF-8 decoder (e.g. a trailing
+      // incomplete multi-byte sequence at end-of-stream). Normally empty;
+      // if the child exited mid-codepoint StringDecoder emits U+FFFD here
+      // rather than silently dropping the bytes.
       buffer += stdoutDecoder.end();
       stderrBuffer += stderrDecoder.end();
       if (buffer.trim()) {
diff --git a/browse/src/utf8-stream-decoder.ts b/browse/src/utf8-stream-decoder.ts
index 4511d4d173..723deb96e0 100644
--- a/browse/src/utf8-stream-decoder.ts
+++ b/browse/src/utf8-stream-decoder.ts
@@ -8,6 +8,11 @@
  * chunks so decoded strings round-trip the original bytes.
  *
  * Pure ASCII streams are unaffected: every byte is a complete code point.
+ *
+ * This is a thin wrapper around the built-in StringDecoder rather than a
+ * direct usage at the call site so the contract (write/end) can be
+ * unit-tested in isolation — see browse/test/utf8-stream-decoder.test.ts,
+ * which splits multi-byte text at every byte offset to pin the behavior.
  */
 
 import { StringDecoder } from 'string_decoder';