Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions browse/src/sidebar-agent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import { spawn } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import { safeUnlink } from './error-handling';
import { createUtf8StreamDecoder } from './utf8-stream-decoder';

const QUEUE = process.env.SIDEBAR_QUEUE_PATH || path.join(process.env.HOME || '/tmp', '.gstack', 'sidebar-agent-queue.jsonl');
const KILL_FILE = path.join(path.dirname(QUEUE), 'sidebar-agent-kill');
Expand Down Expand Up @@ -331,9 +332,11 @@ async function askClaude(queueEntry: QueueEntry): Promise<void> {
}, 500);

let buffer = '';
const stdoutDecoder = createUtf8StreamDecoder();
const stderrDecoder = createUtf8StreamDecoder();

proc.stdout.on('data', (data: Buffer) => {
buffer += data.toString();
buffer += stdoutDecoder.write(data);
const lines = buffer.split('\n');
buffer = lines.pop() || '';
for (const line of lines) {
Expand All @@ -346,13 +349,19 @@ async function askClaude(queueEntry: QueueEntry): Promise<void> {

let stderrBuffer = '';
proc.stderr.on('data', (data: Buffer) => {
stderrBuffer += data.toString();
stderrBuffer += stderrDecoder.write(data);
});

proc.on('close', (code) => {
clearInterval(cancelCheck);
activeProc = null;
activeProcs.delete(tid);
// Flush any bytes held back by the UTF-8 decoder (e.g. a trailing
// incomplete multi-byte sequence at end-of-stream). Normally empty;
// if the child exited mid-codepoint StringDecoder emits U+FFFD here
// rather than silently dropping the bytes.
buffer += stdoutDecoder.end();
stderrBuffer += stderrDecoder.end();
if (buffer.trim()) {
try { handleStreamEvent(JSON.parse(buffer), tid); } catch (err: any) {
console.error(`[sidebar-agent] Tab ${tid}: Failed to parse final buffer:`, buffer.slice(0, 100), err.message);
Expand Down
29 changes: 29 additions & 0 deletions browse/src/utf8-stream-decoder.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/**
* UTF-8 stream decoder for child-process stdout/stderr.
*
* Per-chunk `Buffer.toString('utf8')` corrupts multi-byte characters
* (Korean, Japanese, Chinese, emoji, etc.) when a code-point straddles a
* chunk boundary — the partial bytes decode to U+FFFD and the next chunk
* starts mid-sequence. StringDecoder buffers partial code units across
* chunks so decoded strings round-trip the original bytes.
*
* Pure ASCII streams are unaffected: every byte is a complete code point.
*
* This is a thin wrapper around the built-in StringDecoder rather than a
* direct usage at the call site so the contract (write/end) can be
* unit-tested in isolation — see browse/test/utf8-stream-decoder.test.ts,
* which splits multi-byte text at every byte offset to pin the behavior.
*/

import { StringDecoder } from 'string_decoder';

export function createUtf8StreamDecoder(): {
write(chunk: Buffer): string;
end(): string;
} {
const decoder = new StringDecoder('utf8');
return {
write: (chunk) => decoder.write(chunk),
end: () => decoder.end(),
};
}
88 changes: 88 additions & 0 deletions browse/test/utf8-stream-decoder.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/**
* Regression tests for UTF-8 stream chunk-boundary handling.
*
* Background: sidebar-agent previously used per-chunk `Buffer.toString()` on
* `proc.stdout` data events. When claude emitted non-ASCII text (Korean,
* Japanese, Chinese, emoji), a multi-byte character that happened to land on
* a chunk boundary would be mojibake'd — e.g. Korean "합니다" rendered as
* "핣니다" in the sidebar. These tests pin the fix: chunks split at every
* byte offset must reassemble to the original string.
*/

import { describe, test, expect } from 'bun:test';
import { createUtf8StreamDecoder } from '../src/utf8-stream-decoder';

function feedSplit(input: Buffer, splitAt: number): string {
const decoder = createUtf8StreamDecoder();
let out = decoder.write(input.subarray(0, splitAt));
out += decoder.write(input.subarray(splitAt));
out += decoder.end();
return out;
}

describe('createUtf8StreamDecoder', () => {
test('preserves Korean text split at every byte boundary', () => {
// Each Hangul syllable is 3 bytes in UTF-8, so every non-zero offset
// inside a syllable is a boundary the naive .toString() fix must survive.
const text = '안녕하세요 합니다';
const buf = Buffer.from(text, 'utf8');
for (let i = 0; i <= buf.length; i++) {
expect(feedSplit(buf, i)).toBe(text);
}
});

test('preserves 4-byte emoji split at every byte boundary', () => {
const text = 'hi 👋 there 🎉 done';
const buf = Buffer.from(text, 'utf8');
for (let i = 0; i <= buf.length; i++) {
expect(feedSplit(buf, i)).toBe(text);
}
});

test('preserves Japanese and Chinese mix across chunks', () => {
const text = 'こんにちは 你好 世界';
const buf = Buffer.from(text, 'utf8');
for (let i = 0; i <= buf.length; i++) {
expect(feedSplit(buf, i)).toBe(text);
}
});

test('never emits the U+FFFD replacement char for valid UTF-8', () => {
const text = '핣'.repeat(100); // dense Korean
const buf = Buffer.from(text, 'utf8');
for (let i = 0; i <= buf.length; i++) {
expect(feedSplit(buf, i)).not.toContain('\uFFFD');
}
});

test('handles many tiny chunks (one byte at a time)', () => {
const text = 'stream 안녕 🎉 end';
const buf = Buffer.from(text, 'utf8');
const decoder = createUtf8StreamDecoder();
let out = '';
for (let i = 0; i < buf.length; i++) {
out += decoder.write(buf.subarray(i, i + 1));
}
out += decoder.end();
expect(out).toBe(text);
});

test('ASCII-only input is unaffected', () => {
const text = '{"type":"text","content":"hello world"}\n';
const buf = Buffer.from(text, 'utf8');
for (let i = 0; i <= buf.length; i++) {
expect(feedSplit(buf, i)).toBe(text);
}
});

test('end() flushes trailing partial sequence as replacement char', () => {
// If the stream truncates mid-character (process killed), end() should
// not hold bytes indefinitely. StringDecoder emits U+FFFD for the
// incomplete tail — acceptable: we prefer a visible marker over lost data.
const decoder = createUtf8StreamDecoder();
const partial = Buffer.from('안', 'utf8').subarray(0, 2); // 2 of 3 bytes
const mid = decoder.write(partial);
const flushed = decoder.end();
expect(mid + flushed).toContain('\uFFFD');
});
});