From 08cf7d607f304cf2c701b41ab9cf70b44f6fc167 Mon Sep 17 00:00:00 2001 From: lauren Date: Wed, 20 May 2026 23:43:06 -0700 Subject: [PATCH] [rust-compiler] Emit loc.column/index as UTF-16 code units in SWC frontend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the cluster-1 BytePos shift, `ConvertCtx::position()` emitted `loc.column` and `loc.index` as 0-based UTF-8 byte offsets. Babel emits them as 0-based UTF-16 code unit offsets (matching JS string indexing). For files containing any character above U+FFFF (e.g. an emoji like 🔴 U+1F534), the two diverge by +2 per such character because the char is 4 bytes in UTF-8 but 2 code units in UTF-16. (Any non-ASCII character diverges, in fact: 2- and 3-byte UTF-8 sequences are a single UTF-16 code unit, so they shift offsets by +1 and +2 respectively.) Precompute a `utf16_offsets: Vec<u32>` table in `ConvertCtx::new` that maps each source byte index to its 0-based UTF-16 code unit offset. `position()` then looks up `index` directly and computes `column` as `index - utf16_index_of_line_start`. O(1) per call; the table costs ~4× the source length in memory, which is bounded for fixture/file inputs. Considered an alternative that walks the source line on each `position()` call to count UTF-16 code units. More memory-frugal but O(line length) per call. The precomputed table wins on O(1) lookup and the per-call cost matters because `position()` is invoked on every node, comment, and reference in the converter. Clamp the byte index in `position()` to the sentinel at `utf16_offsets.len() - 1`. Synthetic spans (e.g. compiler-generated imports given `BytePos(1)`) can point past EOF in degenerate cases; clamping avoids a panic. Line numbers stay 1-based and the binary-search remains keyed on byte offsets, since the underlying `line_offsets` table is byte-based. 
Fixes 4 e2e parity fixtures (3 targeted + 1 latent): - effect-derived-computations/invalid-derived-computation-in-effect.js - error.invalid-derived-computation-in-effect.js - fbt/error.todo-multiple-fbt-plural.tsx - (one additional latent fixture passes for free) Test plan: - bash compiler/scripts/test-e2e.sh --variant swc: Before: Total 1770/1795 After: Total 1774/1795 (4 fixed) - bash compiler/scripts/test-e2e.sh --variant babel: 1788/1795 (unchanged) - bash compiler/scripts/test-e2e.sh --variant oxc: 1702/1795 (unchanged) - cargo test --workspace: 56 passed, 0 failed --- .../react_compiler_swc/src/convert_ast.rs | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/compiler/crates/react_compiler_swc/src/convert_ast.rs b/compiler/crates/react_compiler_swc/src/convert_ast.rs index 319ff59e2a2..651d9f8b21f 100644 --- a/compiler/crates/react_compiler_swc/src/convert_ast.rs +++ b/compiler/crates/react_compiler_swc/src/convert_ast.rs @@ -191,19 +191,27 @@ struct ConvertCtx<'a> { #[allow(dead_code)] source_text: &'a str, line_offsets: Vec, + utf16_offsets: Vec, } impl<'a> ConvertCtx<'a> { fn new(source_text: &'a str) -> Self { let mut line_offsets = vec![0u32]; + let mut utf16_offsets = vec![0u32; source_text.len() + 1]; + let mut utf16_offset = 0u32; for (i, ch) in source_text.char_indices() { + let next = i + ch.len_utf8(); + utf16_offsets[i..next].fill(utf16_offset); + utf16_offset += ch.len_utf16() as u32; if ch == '\n' { - line_offsets.push((i + 1) as u32); + line_offsets.push(next as u32); } } + utf16_offsets[source_text.len()] = utf16_offset; Self { source_text, line_offsets, + utf16_offsets, } } @@ -222,7 +230,7 @@ impl<'a> ConvertCtx<'a> { } } - /// `BytePos` is 1-based; emit 0-based `loc` to match Babel. + /// `BytePos` is 1-based; emit 0-based UTF-16 `loc` to match Babel. /// (`BaseNode.start`/`end` stays 1-based: `convert_scope` keys on it.) 
fn position(&self, offset: u32) -> Position { let zero_based = offset.saturating_sub(1); @@ -231,10 +239,14 @@ impl<'a> ConvertCtx<'a> { Err(idx) => idx.saturating_sub(1), }; let line_start = self.line_offsets[line_idx]; + // Synthetic spans can point past EOF; clamp to the sentinel. + let byte_idx = (zero_based as usize).min(self.utf16_offsets.len() - 1); + let utf16_offset = self.utf16_offsets[byte_idx]; + let line_start_utf16 = self.utf16_offsets[line_start as usize]; Position { line: (line_idx as u32) + 1, - column: zero_based - line_start, - index: Some(zero_based), + column: utf16_offset - line_start_utf16, + index: Some(utf16_offset), } }