From ba4be9f86c7296dd40305b67fb9862797a983d3e Mon Sep 17 00:00:00 2001
From: overlookmotel
Date: Tue, 25 Feb 2025 18:47:17 +0000
Subject: [PATCH] perf(ast/estree): speed up building UTF8-UTF16 translation
 table with SIMD

---
 crates/oxc_ast/src/utf8_to_utf16.rs | 228 +++++++++++++++++++++++-----
 1 file changed, 193 insertions(+), 35 deletions(-)

diff --git a/crates/oxc_ast/src/utf8_to_utf16.rs b/crates/oxc_ast/src/utf8_to_utf16.rs
index 19764c776326b8..b9875a212ad778 100644
--- a/crates/oxc_ast/src/utf8_to_utf16.rs
+++ b/crates/oxc_ast/src/utf8_to_utf16.rs
@@ -1,6 +1,6 @@
 //! Convert UTF-8 span offsets to UTF-16.
 
-use std::cmp::min;
+use std::{cmp::min, slice};
 
 use oxc_span::Span;
 use oxc_syntax::module_record::{ModuleRecord, VisitMutModuleRecord};
@@ -33,40 +33,10 @@ impl Utf8ToUtf16 {
     /// Create new [`Utf8ToUtf16`] conversion table from source text.
     pub fn new(source_text: &str) -> Self {
         let mut translations = Vec::with_capacity(16);
+        translations.push(Translation { utf8_offset: 0, utf16_difference: 0 });
 
-        // Translation from UTF-8 byte offset to UTF-16 char offset:
-        //
-        // * 1-byte UTF-8 sequence
-        //   = 1st byte 0xxxxxxx (0 - 0x7F)
-        //   -> 1 x UTF-16 char
-        //   UTF-16 len = UTF-8 len
-        // * 2-byte UTF-8 sequence
-        //   = 1st byte 110xxxxx (0xC0 - 0xDF), remaining bytes 10xxxxxx (0x80 - 0xBF)
-        //   -> 1 x UTF-16
-        //   UTF-16 len = UTF-8 len - 1
-        // * 3-byte UTF-8 sequence
-        //   = 1st byte 1110xxxx (0xE0 - 0xEF), remaining bytes 10xxxxxx (0x80 - 0xBF)
-        //   -> 1 x UTF-16
-        //   UTF-16 len = UTF-8 len - 2
-        // * 4-byte UTF-8 sequence
-        //   = 1st byte 1111xxxx (0xF0 - 0xFF), remaining bytes 10xxxxxx (0x80 - 0xBF)
-        //   -> 2 x UTF-16
-        //   UTF-16 len = UTF-8 len - 2
-        //
-        // So UTF-16 offset = UTF-8 offset - count of bytes `>= 0xC0` - count of bytes `>= 0xE0`
-        let mut utf16_difference = 0;
-        #[expect(clippy::cast_possible_truncation)]
-        for (utf8_offset, &byte) in source_text.as_bytes().iter().enumerate() {
-            if byte >= 0xC0 {
-                let difference_for_this_byte = u32::from(byte >= 0xE0) + 1;
-                utf16_difference += difference_for_this_byte;
-                // Record `utf8_offset + 1` not `utf8_offset`, because it's only offsets *after* this
-                // Unicode character that need to be shifted
-                translations
-                    .push(Translation { utf8_offset: utf8_offset as u32 + 1, utf16_difference });
-            }
-        }
+        build_translations(source_text, &mut translations);
 
         // If no translations have been added after the first `0, 0` dummy, then source is entirely ASCII.
         // Remove the dummy entry.
@@ -377,6 +347,151 @@ impl VisitMutModuleRecord for Utf8ToUtf16Converter<'_> {
     }
 }
 
+const CHUNK_SIZE: usize = 32;
+const CHUNK_ALIGNMENT: usize = align_of::<AlignedChunk>();
+const _: () = {
+    assert!(CHUNK_SIZE >= CHUNK_ALIGNMENT);
+    assert!(CHUNK_SIZE % CHUNK_ALIGNMENT == 0);
+};
+
+#[repr(C, align(16))]
+struct AlignedChunk([u8; CHUNK_SIZE]);
+
+impl AlignedChunk {
+    // This boils down to 3 x SIMD ops to check 32 bytes for non-ASCII bytes in one go.
+    // https://godbolt.org/z/e3rGd8sEa
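+    // An equivalent iterator form is `self.0.iter().any(|&byte| byte >= 128)`;
+    // the explicit indexed loop below is a shape which LLVM auto-vectorizes
+    // (as the link above shows).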
+    #[inline]
+    fn contains_unicode(&self) -> bool {
+        for index in 0..self.0.len() {
+            if self.0[index] >= 128 {
+                return true;
+            }
+        }
+        false
+    }
+}
+
+/// Build table of translations from UTF-8 offsets to UTF-16 offsets.
+///
+/// Processes the bulk of the source text in chunks of 32 bytes, using SIMD instructions.
+/// This should be much faster than byte-by-byte processing, assuming non-ASCII chars are rare
+/// in source code.
+///
+/// Translation is as follows:
+///
+/// * 1-byte UTF-8 sequence
+///   = 1st byte 0xxxxxxx (0 - 0x7F)
+///   -> 1 x UTF-16 char
+///   UTF-16 len = UTF-8 len
+/// * 2-byte UTF-8 sequence
+///   = 1st byte 110xxxxx (0xC0 - 0xDF), remaining bytes 10xxxxxx (0x80 - 0xBF)
+///   -> 1 x UTF-16 char
+///   UTF-16 len = UTF-8 len - 1
+/// * 3-byte UTF-8 sequence
+///   = 1st byte 1110xxxx (0xE0 - 0xEF), remaining bytes 10xxxxxx (0x80 - 0xBF)
+///   -> 1 x UTF-16 char
+///   UTF-16 len = UTF-8 len - 2
+/// * 4-byte UTF-8 sequence
+///   = 1st byte 1111xxxx (0xF0 - 0xFF), remaining bytes 10xxxxxx (0x80 - 0xBF)
+///   -> 2 x UTF-16 chars
+///   UTF-16 len = UTF-8 len - 2
+///
+/// So UTF-16 offset = UTF-8 offset - count of bytes `>= 0xC0` - count of bytes `>= 0xE0`.
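+///
+/// For example, `£` is 2 UTF-8 bytes (0xC2 0xA3) but 1 UTF-16 char:
+/// 1 byte is `>= 0xC0` and none are `>= 0xE0`, so UTF-8 offset 2 maps to 2 - 1 = 1.
+/// `🤨` is 4 UTF-8 bytes (0xF0 0x9F 0xA4 0xA8) and 2 UTF-16 chars (a surrogate pair):
+/// 1 byte is `>= 0xC0` and 1 byte is `>= 0xE0`, so UTF-8 offset 4 maps to 4 - 1 - 1 = 2.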
+fn build_translations(source_text: &str, translations: &mut Vec<Translation>) {
+    // Running counter of difference between UTF-8 and UTF-16 offset
+    let mut utf16_difference = 0;
+
+    // Closure that processes a slice of bytes
+    #[expect(clippy::cast_possible_truncation)]
+    let mut process_slice = |slice: &[u8], start_offset: usize| {
+        for (index, &byte) in slice.iter().enumerate() {
+            if byte >= 0xC0 {
+                let difference_for_this_byte = u32::from(byte >= 0xE0) + 1;
+                utf16_difference += difference_for_this_byte;
+                // Record `offset + 1` not `offset`, because it's only offsets *after* this
+                // Unicode character that need to be shifted
+                let offset = (start_offset + index) as u32;
+                translations.push(Translation { utf8_offset: offset + 1, utf16_difference });
+            }
+        }
+    };
+
+    // If source text is short, just process it byte-by-byte
+    let bytes = source_text.as_bytes();
+    if bytes.len() < CHUNK_SIZE {
+        process_slice(bytes, 0);
+        return;
+    }
+
+    // Process any unaligned bytes at the start of the source
+    let start_ptr = bytes.as_ptr();
+    let mut ptr = start_ptr;
+
+    let first_chunk_bytes = start_ptr.align_offset(CHUNK_ALIGNMENT);
+    if first_chunk_bytes > 0 {
+        // SAFETY: `first_chunk_bytes` is less than `CHUNK_ALIGNMENT`, which in turn is no bigger
+        // than `CHUNK_SIZE`. We already exited if source is shorter than `CHUNK_SIZE` bytes,
+        // so there must be at least `first_chunk_bytes` bytes in source.
+        let first_chunk = unsafe { slice::from_raw_parts(ptr, first_chunk_bytes) };
+        process_slice(first_chunk, 0);
+        // SAFETY: For the reasons given above, `first_chunk_bytes` must be in bounds
+        ptr = unsafe { ptr.add(first_chunk_bytes) };
+    }
+
+    // Process the main body as aligned chunks of 32 bytes.
+    //
+    // We've aligned `ptr` to `CHUNK_ALIGNMENT`, so can now read the rest of the source as
+    // `AlignedChunk`s (apart from a few bytes at the end, which may not be enough to make
+    // a whole `AlignedChunk`).
+    //
+    // Do a fast check for any non-ASCII bytes in each chunk using SIMD.
+    // Only process the chunk byte-by-byte if it contains some non-ASCII bytes.
+
+    let remaining_bytes = bytes.len() - first_chunk_bytes;
+    let body_bytes = remaining_bytes & !(CHUNK_SIZE - 1);
+    // SAFETY: `body_bytes` is no more than the number of bytes remaining in `bytes`, so in bounds
+    let end_ptr = unsafe { ptr.add(body_bytes) };
+    let last_chunk_bytes = remaining_bytes - body_bytes;
+
+    while ptr < end_ptr {
+        // SAFETY: `ptr` was aligned to `CHUNK_ALIGNMENT` after processing the 1st chunk.
+        // It is incremented in this loop by `CHUNK_SIZE`, which is a multiple of `CHUNK_ALIGNMENT`,
+        // so `ptr` always remains aligned for `CHUNK_ALIGNMENT`.
+        // The `ptr < end_ptr` check ensures it's valid to read `CHUNK_SIZE` bytes starting at `ptr`.
+        #[expect(clippy::cast_ptr_alignment)]
+        let chunk = unsafe { ptr.cast::<AlignedChunk>().as_ref().unwrap_unchecked() };
+        if chunk.contains_unicode() {
+            // SAFETY: `ptr` is equal to or after `start_ptr`. Both are within bounds of `bytes`.
+            // `ptr` is derived from `start_ptr`.
+            let offset = unsafe { offset_from(ptr, start_ptr) };
+            process_slice(&chunk.0, offset);
+        }
+
+        // SAFETY: `ptr + CHUNK_SIZE` is at most `end_ptr`, which is in bounds
+        ptr = unsafe { ptr.add(CHUNK_SIZE) };
+    }
+
+    // Process the last chunk
+    if last_chunk_bytes > 0 {
+        // SAFETY: `ptr + last_chunk_bytes` goes up to the end of `bytes`.
+        // `bytes` is a `&[u8]`, so is guaranteed initialized and valid for reads.
+        let last_chunk = unsafe { slice::from_raw_parts(ptr, last_chunk_bytes) };
+        // SAFETY: `ptr` is after `start_ptr`. Both are within bounds of `bytes`.
+        // `ptr` is derived from `start_ptr`.
+        let offset = unsafe { offset_from(ptr, start_ptr) };
+        process_slice(last_chunk, offset);
+    }
+}
+
+/// Calculate distance in bytes from `from_ptr` to `to_ptr`.
+///
+/// # SAFETY
+/// * `from_ptr` must be before or equal to `to_ptr`.
+/// * Both pointers must point to within the same object.
+/// * Both pointers must be derived from the same original pointer.
+#[inline]
+unsafe fn offset_from(to_ptr: *const u8, from_ptr: *const u8) -> usize {
+    let offset = unsafe { to_ptr.offset_from(from_ptr) };
+    // SAFETY: Caller guarantees `to_ptr >= from_ptr`, so `offset` cannot be negative
+    unsafe { usize::try_from(offset).unwrap_unchecked() }
+}
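+
+// Illustrative sketch of how the table is used (mirrors the tests below; offsets are for
+// the string `"_£_"`, where `£` is 2 UTF-8 bytes but 1 UTF-16 char):
+//
+//     let table = Utf8ToUtf16::new("_£_");
+//     if let Some(mut converter) = table.converter() {
+//         let mut offset = 4; // UTF-8 offset of end of string
+//         converter.convert_offset(&mut offset);
+//         assert_eq!(offset, 3); // UTF-16 offset of end of string
+//     }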
+
 #[cfg(test)]
 mod test {
     use oxc_allocator::Allocator;
@@ -459,12 +574,55 @@ mod test {
             ("_🤨_🤨_", &[(0, 0), (1, 1), (5, 3), (6, 4), (10, 6), (11, 7)]),
         ];
 
+        // Convert `cases` to a `Vec`, so that generated cases can be appended to it
+        let mut cases_vec = cases
+            .iter()
+            .map(|&(text, translations)| (text, translations.to_vec()))
+            .collect::<Vec<_>>();
+
+        // Create 1 long string containing 99 repeats of each test case, concatenated
+        let repeats = 99u32;
+        let mut texts = String::new();
+        for (text, _) in cases {
+            for _i in 0..repeats {
+                texts.push_str(text);
+            }
+        }
+
+        // Generate a further test case for each of the defined cases, repeated 99 times.
+        // Each of these cases references a slice of the large `texts` string.
+        // The reason for doing this is so that the string slices have uneven alignments, which
+        // exercises all parts of `build_translations` - it handles the unaligned header/tail
+        // of the source text differently from the main body.
+        // The number of repeats (99) is odd for the same reason - to ensure each string slice
+        // begins at a memory address which is not evenly aligned.
+        let mut offset = 0;
         for &(text, translations) in cases {
+            let end_offset = offset + text.len() * (repeats as usize);
+            let repeated_text = &texts[offset..end_offset];
+
+            let (len_utf8, len_utf16) = *translations.last().unwrap();
+            assert_eq!(text.len(), len_utf8 as usize);
+
+            let mut repeated_translations = vec![];
+            for i in 0..repeats {
+                for &(offset_utf8, offset_utf16) in translations {
+                    repeated_translations
+                        .push((offset_utf8 + len_utf8 * i, offset_utf16 + len_utf16 * i));
+                }
+            }
+
+            cases_vec.push((repeated_text, repeated_translations));
+
+            offset = end_offset;
+        }
+
+        for (text, translations) in cases_vec {
             let table = Utf8ToUtf16::new(text);
             let converter = table.converter();
             if let Some(mut converter) = converter {
                 // Iterate in forwards order
-                for &(utf8_offset, expected_utf16_offset) in translations {
+                for &(utf8_offset, expected_utf16_offset) in &translations {
                     let mut utf16_offset = utf8_offset;
                     converter.convert_offset(&mut utf16_offset);
                     assert_eq!(utf16_offset, expected_utf16_offset);
@@ -478,7 +636,7 @@ mod test {
                 }
             } else {
                 // No Unicode chars. All offsets should be the same.
-                for &(utf8_offset, expected_utf16_offset) in translations {
+                for &(utf8_offset, expected_utf16_offset) in &translations {
                     assert_eq!(utf8_offset, expected_utf16_offset);
                 }
             }