From ba4be9f86c7296dd40305b67fb9862797a983d3e Mon Sep 17 00:00:00 2001
From: overlookmotel
Date: Tue, 25 Feb 2025 18:47:17 +0000
Subject: [PATCH] perf(ast/estree): speed up building UTF8-UTF16 translation
 table with SIMD

---
 crates/oxc_ast/src/utf8_to_utf16.rs | 228 +++++++++++++++++++++++-----
 1 file changed, 193 insertions(+), 35 deletions(-)

diff --git a/crates/oxc_ast/src/utf8_to_utf16.rs b/crates/oxc_ast/src/utf8_to_utf16.rs
index 19764c776326b8..b9875a212ad778 100644
--- a/crates/oxc_ast/src/utf8_to_utf16.rs
+++ b/crates/oxc_ast/src/utf8_to_utf16.rs
@@ -1,6 +1,6 @@
 //! Convert UTF-8 span offsets to UTF-16.
 
-use std::cmp::min;
+use std::{cmp::min, slice};
 
 use oxc_span::Span;
 use oxc_syntax::module_record::{ModuleRecord, VisitMutModuleRecord};
@@ -33,40 +33,10 @@ impl Utf8ToUtf16 {
     /// Create new [`Utf8ToUtf16`] conversion table from source text.
     pub fn new(source_text: &str) -> Self {
         let mut translations = Vec::with_capacity(16);
+        translations.push(Translation { utf8_offset: 0, utf16_difference: 0 });
 
-        // Translation from UTF-8 byte offset to UTF-16 char offset:
-        //
-        // * 1-byte UTF-8 sequence
-        //   = 1st byte 0xxxxxxx (0 - 0x7F)
-        //   -> 1 x UTF-16 char
-        //   UTF-16 len = UTF-8 len
-        // * 2-byte UTF-8 sequence
-        //   = 1st byte 110xxxxx (0xC0 - 0xDF), remaining bytes 10xxxxxx (0x80 - 0xBF)
-        //   -> 1 x UTF-16
-        //   UTF-16 len = UTF-8 len - 1
-        // * 3-byte UTF-8 sequence
-        //   = 1st byte 1110xxxx (0xE0 - 0xEF), remaining bytes 10xxxxxx (0x80 - 0xBF)
-        //   -> 1 x UTF-16
-        //   UTF-16 len = UTF-8 len - 2
-        // * 4-byte UTF-8 sequence
-        //   = 1st byte 1111xxxx (0xF0 - 0xFF), remaining bytes 10xxxxxx (0x80 - 0xBF)
-        //   -> 2 x UTF-16
-        //   UTF-16 len = UTF-8 len - 2
-        //
-        // So UTF-16 offset = UTF-8 offset - count of bytes `>= 0xC0` - count of bytes `>= 0xE0`
-        let mut utf16_difference = 0;
-        #[expect(clippy::cast_possible_truncation)]
-        for (utf8_offset, &byte) in source_text.as_bytes().iter().enumerate() {
-            if byte >= 0xC0 {
-                let difference_for_this_byte = u32::from(byte >= 0xE0) + 1;
-                utf16_difference += difference_for_this_byte;
-                // Record `utf8_offset + 1` not `utf8_offset`, because it's only offsets *after* this
-                // Unicode character that need to be shifted
-                translations
-                    .push(Translation { utf8_offset: utf8_offset as u32 + 1, utf16_difference });
-            }
-        }
+        build_translations(source_text, &mut translations);
 
         // If no translations have been added after the first `0, 0` dummy, then source is entirely ASCII.
         // Remove the dummy entry.
@@ -377,6 +347,151 @@ impl VisitMutModuleRecord for Utf8ToUtf16Converter<'_> {
     }
 }
 
+const CHUNK_SIZE: usize = 32;
+const CHUNK_ALIGNMENT: usize = align_of::<AlignedChunk>();
+const _: () = {
+    assert!(CHUNK_SIZE >= CHUNK_ALIGNMENT);
+    assert!(CHUNK_SIZE % CHUNK_ALIGNMENT == 0);
+};
+
+#[repr(C, align(16))]
+struct AlignedChunk([u8; CHUNK_SIZE]);
+
+impl AlignedChunk {
+    // This boils down to 3 x SIMD ops to check 32 bytes for non-ASCII bytes in one go.
+    // https://godbolt.org/z/e3rGd8sEa
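+    // An equivalent iterator form is `self.0.iter().any(|&byte| byte >= 128)`;
+    // the explicit indexed loop below is a shape which LLVM auto-vectorizes
+    // (as the link above shows).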
+    #[inline]
+    fn contains_unicode(&self) -> bool {
+        for index in 0..self.0.len() {
+            if self.0[index] >= 128 {
+                return true;
+            }
+        }
+        false
+    }
+}
+
+/// Build table of translations from UTF-8 offsets to UTF-16 offsets.
+///
+/// Processes the bulk of the source text in chunks of 32 bytes, using SIMD instructions.
+/// This should be much faster than byte-by-byte processing, assuming non-ASCII chars are rare
+/// in source code.
+///
+/// Translation is as follows:
+///
+/// * 1-byte UTF-8 sequence
+///   = 1st byte 0xxxxxxx (0 - 0x7F)
+///   -> 1 x UTF-16 char
+///   UTF-16 len = UTF-8 len
+/// * 2-byte UTF-8 sequence
+///   = 1st byte 110xxxxx (0xC0 - 0xDF), remaining bytes 10xxxxxx (0x80 - 0xBF)
+///   -> 1 x UTF-16 char
+///   UTF-16 len = UTF-8 len - 1
+/// * 3-byte UTF-8 sequence
+///   = 1st byte 1110xxxx (0xE0 - 0xEF), remaining bytes 10xxxxxx (0x80 - 0xBF)
+///   -> 1 x UTF-16 char
+///   UTF-16 len = UTF-8 len - 2
+/// * 4-byte UTF-8 sequence
+///   = 1st byte 1111xxxx (0xF0 - 0xFF), remaining bytes 10xxxxxx (0x80 - 0xBF)
+///   -> 2 x UTF-16 chars
+///   UTF-16 len = UTF-8 len - 2
+///
+/// So UTF-16 offset = UTF-8 offset - count of bytes `>= 0xC0` - count of bytes `>= 0xE0`.
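+///
+/// For example, `£` is 2 UTF-8 bytes (0xC2 0xA3) but 1 UTF-16 char:
+/// 1 byte is `>= 0xC0` and none are `>= 0xE0`, so UTF-8 offset 2 maps to 2 - 1 = 1.
+/// `🤨` is 4 UTF-8 bytes (0xF0 0x9F 0xA4 0xA8) and 2 UTF-16 chars (a surrogate pair):
+/// 1 byte is `>= 0xC0` and 1 byte is `>= 0xE0`, so UTF-8 offset 4 maps to 4 - 1 - 1 = 2.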
+fn build_translations(source_text: &str, translations: &mut Vec<Translation>) {
+    // Running counter of difference between UTF-8 and UTF-16 offset
+    let mut utf16_difference = 0;
+
+    // Closure that processes a slice of bytes
+    #[expect(clippy::cast_possible_truncation)]
+    let mut process_slice = |slice: &[u8], start_offset: usize| {
+        for (index, &byte) in slice.iter().enumerate() {
+            if byte >= 0xC0 {
+                let difference_for_this_byte = u32::from(byte >= 0xE0) + 1;
+                utf16_difference += difference_for_this_byte;
+                // Record `offset + 1` not `offset`, because it's only offsets *after* this
+                // Unicode character that need to be shifted
+                let offset = (start_offset + index) as u32;
+                translations.push(Translation { utf8_offset: offset + 1, utf16_difference });
+            }
+        }
+    };
+
+    // If source text is short, just process it byte-by-byte
+    let bytes = source_text.as_bytes();
+    if bytes.len() < CHUNK_SIZE {
+        process_slice(bytes, 0);
+        return;
+    }
+
+    // Process any unaligned bytes at the start of the source
+    let start_ptr = bytes.as_ptr();
+    let mut ptr = start_ptr;
+
+    let first_chunk_bytes = start_ptr.align_offset(CHUNK_ALIGNMENT);
+    if first_chunk_bytes > 0 {
+        // SAFETY: `first_chunk_bytes` is less than `CHUNK_ALIGNMENT`, which in turn is no bigger
+        // than `CHUNK_SIZE`. We already exited if source is shorter than `CHUNK_SIZE` bytes,
+        // so there must be at least `first_chunk_bytes` bytes in source.
+        let first_chunk = unsafe { slice::from_raw_parts(ptr, first_chunk_bytes) };
+        process_slice(first_chunk, 0);
+        // SAFETY: For the reasons given above, `first_chunk_bytes` must be in bounds
+        ptr = unsafe { ptr.add(first_chunk_bytes) };
+    }
+
+    // Process the main body as aligned chunks of 32 bytes.
+    //
+    // We've aligned `ptr` to `CHUNK_ALIGNMENT`, so can now read the rest of the source as
+    // `AlignedChunk`s (apart from a few bytes at the end, which may not be enough to make
+    // a whole `AlignedChunk`).
+    //
+    // Do a fast check for any non-ASCII bytes in each chunk using SIMD.
+    // Only process the chunk byte-by-byte if it contains some non-ASCII bytes.
+
+    let remaining_bytes = bytes.len() - first_chunk_bytes;
+    let body_bytes = remaining_bytes & !(CHUNK_SIZE - 1);
+    // SAFETY: `body_bytes` is no more than the number of bytes remaining in `bytes`, so in bounds
+    let end_ptr = unsafe { ptr.add(body_bytes) };
+    let last_chunk_bytes = remaining_bytes - body_bytes;
+
+    while ptr < end_ptr {
+        // SAFETY: `ptr` was aligned to `CHUNK_ALIGNMENT` after processing the 1st chunk.
+        // It is incremented in this loop by `CHUNK_SIZE`, which is a multiple of `CHUNK_ALIGNMENT`,
+        // so `ptr` always remains aligned for `CHUNK_ALIGNMENT`.
+        // The `ptr < end_ptr` check ensures it's valid to read `CHUNK_SIZE` bytes starting at `ptr`.
+        #[expect(clippy::cast_ptr_alignment)]
+        let chunk = unsafe { ptr.cast::<AlignedChunk>().as_ref().unwrap_unchecked() };
+        if chunk.contains_unicode() {
+            // SAFETY: `ptr` is equal to or after `start_ptr`. Both are within bounds of `bytes`.
+            // `ptr` is derived from `start_ptr`.
+            let offset = unsafe { offset_from(ptr, start_ptr) };
+            process_slice(&chunk.0, offset);
+        }
+
+        // SAFETY: `ptr + CHUNK_SIZE` is at most `end_ptr`, which is in bounds
+        ptr = unsafe { ptr.add(CHUNK_SIZE) };
+    }
+
+    // Process the last chunk
+    if last_chunk_bytes > 0 {
+        // SAFETY: `ptr + last_chunk_bytes` goes up to the end of `bytes`.
+        // `bytes` is a `&[u8]`, so is guaranteed initialized and valid for reads.
+        let last_chunk = unsafe { slice::from_raw_parts(ptr, last_chunk_bytes) };
+        // SAFETY: `ptr` is after `start_ptr`. Both are within bounds of `bytes`.
+        // `ptr` is derived from `start_ptr`.
+        let offset = unsafe { offset_from(ptr, start_ptr) };
+        process_slice(last_chunk, offset);
+    }
+}
+
+/// Calculate distance in bytes from `from_ptr` to `to_ptr`.
+///
+/// # SAFETY
+/// * `from_ptr` must be before or equal to `to_ptr`.
+/// * Both pointers must point to within the same object.
+/// * Both pointers must be derived from the same original pointer.
+#[inline]
+unsafe fn offset_from(to_ptr: *const u8, from_ptr: *const u8) -> usize {
+    let offset = unsafe { to_ptr.offset_from(from_ptr) };
+    // SAFETY: Caller guarantees `to_ptr >= from_ptr`, so `offset` cannot be negative
+    unsafe { usize::try_from(offset).unwrap_unchecked() }
+}
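+
+// Illustrative sketch of how the table is used (mirrors the tests below; offsets are for
+// the string `"_£_"`, where `£` is 2 UTF-8 bytes but 1 UTF-16 char):
+//
+//     let table = Utf8ToUtf16::new("_£_");
+//     if let Some(mut converter) = table.converter() {
+//         let mut offset = 4; // UTF-8 offset of end of string
+//         converter.convert_offset(&mut offset);
+//         assert_eq!(offset, 3); // UTF-16 offset of end of string
+//     }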
+
 #[cfg(test)]
 mod test {
     use oxc_allocator::Allocator;
@@ -459,12 +574,55 @@ mod test {
             ("_🤨_🤨_", &[(0, 0), (1, 1), (5, 3), (6, 4), (10, 6), (11, 7)]),
         ];
 
+        // Convert `cases` to a `Vec`, so that generated cases can be appended to it
+        let mut cases_vec = cases
+            .iter()
+            .map(|&(text, translations)| (text, translations.to_vec()))
+            .collect::<Vec<_>>();
+
+        // Create 1 long string containing 99 repeats of each test case, concatenated
+        let repeats = 99u32;
+        let mut texts = String::new();
+        for (text, _) in cases {
+            for _i in 0..repeats {
+                texts.push_str(text);
+            }
+        }
+
+        // Generate a further test case for each of the defined cases, repeated 99 times.
+        // Each of these cases references a slice of the large `texts` string.
+        // The reason for doing this is so that the string slices have uneven alignments, which
+        // exercises all parts of `build_translations` - it handles the unaligned header/tail
+        // of the source text differently from the main body.
+        // The number of repeats (99) is odd for the same reason - to ensure each string slice
+        // begins at a memory address which is not evenly aligned.
+        let mut offset = 0;
         for &(text, translations) in cases {
+            let end_offset = offset + text.len() * (repeats as usize);
+            let repeated_text = &texts[offset..end_offset];
+
+            let (len_utf8, len_utf16) = *translations.last().unwrap();
+            assert_eq!(text.len(), len_utf8 as usize);
+
+            let mut repeated_translations = vec![];
+            for i in 0..repeats {
+                for &(offset_utf8, offset_utf16) in translations {
+                    repeated_translations
+                        .push((offset_utf8 + len_utf8 * i, offset_utf16 + len_utf16 * i));
+                }
+            }
+
+            cases_vec.push((repeated_text, repeated_translations));
+
+            offset = end_offset;
+        }
+
+        for (text, translations) in cases_vec {
             let table = Utf8ToUtf16::new(text);
             let converter = table.converter();
             if let Some(mut converter) = converter {
                 // Iterate in forwards order
-                for &(utf8_offset, expected_utf16_offset) in translations {
+                for &(utf8_offset, expected_utf16_offset) in &translations {
                     let mut utf16_offset = utf8_offset;
                     converter.convert_offset(&mut utf16_offset);
                     assert_eq!(utf16_offset, expected_utf16_offset);
@@ -478,7 +636,7 @@ mod test {
                 }
             } else {
                 // No Unicode chars. All offsets should be the same.
-                for &(utf8_offset, expected_utf16_offset) in translations {
+                for &(utf8_offset, expected_utf16_offset) in &translations {
                     assert_eq!(utf8_offset, expected_utf16_offset);
                 }
             }