From 08cf7d607f304cf2c701b41ab9cf70b44f6fc167 Mon Sep 17 00:00:00 2001
From: lauren <lauren@anysphere.co>
Date: Wed, 20 May 2026 23:43:06 -0700
Subject: [PATCH] [rust-compiler] Emit loc.column/index as UTF-16 code units in
 SWC frontend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After the cluster-1 BytePos shift, `ConvertCtx::position()` emitted
`loc.column` and `loc.index` as 0-based UTF-8 byte offsets. Babel emits
them as 0-based UTF-16 code unit offsets (matching JS string indexing).
For files containing any non-ASCII character the two diverge: by +1
per 2-byte character (U+0080–U+07FF) and by +2 per 3-byte character
(U+0800–U+FFFF) or per character above U+FFFF (e.g. an emoji like
🔴 U+1F534, which is 4 bytes in UTF-8 but 2 code units in UTF-16).

Precompute a `utf16_offsets: Vec<u32>` table in `ConvertCtx::new`
that maps each source byte index to its 0-based UTF-16 code unit
offset. `position()` then looks up `index` directly and computes
`column` as `index - utf16_index_of_line_start`. The table lookup is
O(1) per call (the line binary search is unchanged); the table costs
~4× the source length in memory (one `u32` per source byte), which is
bounded for fixture/file inputs.

Considered an alternative that walks the source line on each
`position()` call to count UTF-16 code units. More memory-frugal but
O(line length) per call. The precomputed table wins on O(1) lookup
and the per-call cost matters because `position()` is invoked on
every node, comment, and reference in the converter.

Clamp the byte index in `position()` to the sentinel entry at
`utf16_offsets.len() - 1` (i.e. index `source_text.len()`, which holds
the total UTF-16 length of the source). Synthetic spans (e.g.
compiler-generated imports given `BytePos(1)`) can point past EOF in
degenerate cases; clamping avoids an out-of-bounds panic.

Line numbers stay 1-based and the binary-search remains keyed on
byte offsets, since the underlying `line_offsets` table is byte-based.

Fixes 4 e2e parity fixtures (3 targeted + 1 latent):
- effect-derived-computations/invalid-derived-computation-in-effect.js
- error.invalid-derived-computation-in-effect.js
- fbt/error.todo-multiple-fbt-plural.tsx
- (one additional latent fixture passes for free)

Test plan:
- bash compiler/scripts/test-e2e.sh --variant swc:
    Before: Total 1770/1795
    After:  Total 1774/1795 (4 fixed)
- bash compiler/scripts/test-e2e.sh --variant babel: 1788/1795 (unchanged)
- bash compiler/scripts/test-e2e.sh --variant oxc:   1702/1795 (unchanged)
- cargo test --workspace: 56 passed, 0 failed
---
 .../react_compiler_swc/src/convert_ast.rs     | 20 +++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)
diff --git a/compiler/crates/react_compiler_swc/src/convert_ast.rs b/compiler/crates/react_compiler_swc/src/convert_ast.rs
index 319ff59e2a2..651d9f8b21f 100644
--- a/compiler/crates/react_compiler_swc/src/convert_ast.rs
+++ b/compiler/crates/react_compiler_swc/src/convert_ast.rs
@@ -191,19 +191,27 @@ struct ConvertCtx<'a> {
     #[allow(dead_code)]
     source_text: &'a str,
     line_offsets: Vec<u32>,
+    utf16_offsets: Vec<u32>,
 }
 
 impl<'a> ConvertCtx<'a> {
     fn new(source_text: &'a str) -> Self {
         let mut line_offsets = vec![0u32];
+        let mut utf16_offsets = vec![0u32; source_text.len() + 1];
+        let mut utf16_offset = 0u32;
         for (i, ch) in source_text.char_indices() {
+            let next = i + ch.len_utf8();
+            utf16_offsets[i..next].fill(utf16_offset);
+            utf16_offset += ch.len_utf16() as u32;
             if ch == '\n' {
-                line_offsets.push((i + 1) as u32);
+                line_offsets.push(next as u32);
             }
         }
+        utf16_offsets[source_text.len()] = utf16_offset;
         Self {
             source_text,
             line_offsets,
+            utf16_offsets,
         }
     }
 
@@ -222,7 +230,7 @@ impl<'a> ConvertCtx<'a> {
         }
     }
 
-    /// `BytePos` is 1-based; emit 0-based `loc` to match Babel.
+    /// `BytePos` is 1-based; emit 0-based UTF-16 `loc` to match Babel.
     /// (`BaseNode.start`/`end` stays 1-based: `convert_scope` keys on it.)
     fn position(&self, offset: u32) -> Position {
         let zero_based = offset.saturating_sub(1);
@@ -231,10 +239,14 @@ impl<'a> ConvertCtx<'a> {
             Err(idx) => idx.saturating_sub(1),
         };
         let line_start = self.line_offsets[line_idx];
+        // Synthetic spans can point past EOF; clamp to the sentinel.
+        let byte_idx = (zero_based as usize).min(self.utf16_offsets.len() - 1);
+        let utf16_offset = self.utf16_offsets[byte_idx];
+        let line_start_utf16 = self.utf16_offsets[line_start as usize];
         Position {
             line: (line_idx as u32) + 1,
-            column: zero_based - line_start,
-            index: Some(zero_based),
+            column: utf16_offset - line_start_utf16,
+            index: Some(utf16_offset),
         }
     }