perf(ast/estree): speed up building UTF8-UTF16 translation table with SIMD
overlookmotel committed Feb 25, 2025
1 parent 898ab78 commit ba4be9f
Showing 1 changed file with 193 additions and 35 deletions.
228 changes: 193 additions & 35 deletions crates/oxc_ast/src/utf8_to_utf16.rs
@@ -1,6 +1,6 @@
//! Convert UTF-8 span offsets to UTF-16.
use std::cmp::min;
use std::{cmp::min, slice};

use oxc_span::Span;
use oxc_syntax::module_record::{ModuleRecord, VisitMutModuleRecord};
@@ -33,40 +33,10 @@ impl Utf8ToUtf16 {
/// Create new [`Utf8ToUtf16`] conversion table from source text.
pub fn new(source_text: &str) -> Self {
let mut translations = Vec::with_capacity(16);

translations.push(Translation { utf8_offset: 0, utf16_difference: 0 });

// Translation from UTF-8 byte offset to UTF-16 char offset:
//
// * 1-byte UTF-8 sequence
// = 1st byte 0xxxxxxx (0 - 0x7F)
// -> 1 x UTF-16 char
// UTF-16 len = UTF-8 len
// * 2-byte UTF-8 sequence
// = 1st byte 110xxxxx (0xC0 - 0xDF), remaining bytes 10xxxxxx (0x80 - 0xBF)
// -> 1 x UTF-16
// UTF-16 len = UTF-8 len - 1
// * 3-byte UTF-8 sequence
// = 1st byte 1110xxxx (0xE0 - 0xEF), remaining bytes 10xxxxxx (0x80 - 0xBF)
// -> 1 x UTF-16
// UTF-16 len = UTF-8 len - 2
// * 4-byte UTF-8 sequence
// = 1st byte 11110xxx (0xF0 - 0xF7), remaining bytes 10xxxxxx (0x80 - 0xBF)
// -> 2 x UTF-16
// UTF-16 len = UTF-8 len - 2
//
// So UTF-16 offset = UTF-8 offset - count of bytes `>= 0xC0` - count of bytes `>= 0xE0`
let mut utf16_difference = 0;
#[expect(clippy::cast_possible_truncation)]
for (utf8_offset, &byte) in source_text.as_bytes().iter().enumerate() {
if byte >= 0xC0 {
let difference_for_this_byte = u32::from(byte >= 0xE0) + 1;
utf16_difference += difference_for_this_byte;
// Record `utf8_offset + 1` not `utf8_offset`, because it's only offsets *after* this
// Unicode character that need to be shifted
translations
.push(Translation { utf8_offset: utf8_offset as u32 + 1, utf16_difference });
}
}
build_translations(source_text, &mut translations);

// If no translations have been added after the first `0, 0` dummy, then source is entirely ASCII.
// Remove the dummy entry.
@@ -377,6 +347,151 @@ impl VisitMutModuleRecord for Utf8ToUtf16Converter<'_> {
}
}

const CHUNK_SIZE: usize = 32;
const CHUNK_ALIGNMENT: usize = align_of::<AlignedChunk>();
const _: () = {
assert!(CHUNK_SIZE >= CHUNK_ALIGNMENT);
assert!(CHUNK_SIZE % CHUNK_ALIGNMENT == 0);
};

#[repr(C, align(16))]
struct AlignedChunk([u8; CHUNK_SIZE]);

impl AlignedChunk {
// This boils down to 3 x SIMD ops to check 32 bytes for Unicode bytes in one go.
// https://godbolt.org/z/e3rGd8sEa
#[inline]
fn contains_unicode(&self) -> bool {
for index in 0..self.0.len() {
if self.0[index] >= 128 {
return true;
}
}
false
}
}
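
For intuition, here is a hypothetical hand-written x86_64 equivalent of what the autovectorizer produces for `contains_unicode` (illustration only - the commit itself relies on the compiler, per the godbolt link above, and the function name is invented):

#[cfg(target_arch = "x86_64")]
fn contains_unicode_sse2(chunk: &[u8; 32]) -> bool {
    use std::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};
    // SAFETY: SSE2 is part of the x86_64 baseline, and both 16-byte loads
    // are in bounds of the 32-byte array
    unsafe {
        let lo = _mm_loadu_si128(chunk.as_ptr().cast::<__m128i>());
        let hi = _mm_loadu_si128(chunk.as_ptr().add(16).cast::<__m128i>());
        // Any byte >= 128 has its top bit set. OR the two halves together, then
        // extract the top bit of each byte - a non-zero mask means the chunk
        // contains at least one non-ASCII byte
        _mm_movemask_epi8(_mm_or_si128(lo, hi)) != 0
    }
}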

/// Build table of translations from UTF-8 offsets to UTF-16 offsets.
///
/// Process bulk of source text in chunks of 32 bytes, using SIMD instructions.
/// This should be much faster than byte-by-byte processing, assuming non-ASCII chars are rare in source code.
///
/// The translation is as follows:
///
/// * 1-byte UTF-8 sequence
/// = 1st byte 0xxxxxxx (0 - 0x7F)
/// -> 1 x UTF-16 char
/// UTF-16 len = UTF-8 len
/// * 2-byte UTF-8 sequence
/// = 1st byte 110xxxxx (0xC0 - 0xDF), remaining bytes 10xxxxxx (0x80 - 0xBF)
/// -> 1 x UTF-16
/// UTF-16 len = UTF-8 len - 1
/// * 3-byte UTF-8 sequence
/// = 1st byte 1110xxxx (0xE0 - 0xEF), remaining bytes 10xxxxxx (0x80 - 0xBF)
/// -> 1 x UTF-16
/// UTF-16 len = UTF-8 len - 2
/// * 4-byte UTF-8 sequence
/// = 1st byte 11110xxx (0xF0 - 0xF7), remaining bytes 10xxxxxx (0x80 - 0xBF)
/// -> 2 x UTF-16
/// UTF-16 len = UTF-8 len - 2
///
/// So UTF-16 offset = UTF-8 offset - count of bytes `>= 0xC0` - count of bytes `>= 0xE0`
fn build_translations(source_text: &str, translations: &mut Vec<Translation>) {
// Running counter of difference between UTF-8 and UTF-16 offset
let mut utf16_difference = 0;

// Closure that processes a slice of bytes.
#[expect(clippy::cast_possible_truncation)]
let mut process_slice = |slice: &[u8], start_offset: usize| {
for (index, &byte) in slice.iter().enumerate() {
if byte >= 0xC0 {
let difference_for_this_byte = u32::from(byte >= 0xE0) + 1;
utf16_difference += difference_for_this_byte;
// Record `offset + 1` not `offset`, because it's only offsets *after* this
// Unicode character that need to be shifted
let offset = (start_offset + index) as u32;
translations.push(Translation { utf8_offset: offset + 1, utf16_difference });
}
}
};

// If source text is short, just process byte-by-byte
let bytes = source_text.as_bytes();
if bytes.len() < CHUNK_SIZE {
process_slice(bytes, 0);
return;
}

// Process first few bytes of source
let start_ptr = bytes.as_ptr();
let mut ptr = start_ptr;

let first_chunk_bytes = start_ptr.align_offset(CHUNK_ALIGNMENT);
if first_chunk_bytes > 0 {
// SAFETY: `first_chunk_bytes` is less than `CHUNK_ALIGNMENT`, which in turn is no bigger than
// `CHUNK_SIZE`. We already exited if source is shorter than `CHUNK_SIZE` bytes, so
// there must be at least `first_chunk_bytes` bytes in source.
let first_chunk = unsafe { slice::from_raw_parts(ptr, first_chunk_bytes) };
process_slice(first_chunk, 0);
// SAFETY: For reasons given above, `first_chunk_bytes` must be in bounds
ptr = unsafe { ptr.add(first_chunk_bytes) };
}

// Process main body as aligned chunks of 32 bytes.
//
// We've aligned `ptr` to `CHUNK_ALIGNMENT`, so can now read the rest of source as `AlignedChunk`s
// (apart from a few bytes on end which may not be enough to make a whole `AlignedChunk`).
//
// Do a fast check for any non-ASCII bytes in each chunk using SIMD.
// Only if there are some non-ASCII bytes, process the chunk byte-by-byte.

let remaining_bytes = bytes.len() - first_chunk_bytes;
let body_bytes = remaining_bytes & !(CHUNK_SIZE - 1);
// SAFETY: `body_bytes` is no more than the number of bytes remaining in `bytes`, so in bounds
let end_ptr = unsafe { ptr.add(body_bytes) };
let last_chunk_bytes = remaining_bytes - body_bytes;

while ptr < end_ptr {
// SAFETY: `ptr` was aligned to `CHUNK_ALIGNMENT` after processing 1st chunk.
// It is incremented in this loop by `CHUNK_SIZE`, which is a multiple of `CHUNK_ALIGNMENT`,
// so `ptr` remains always aligned for `CHUNK_ALIGNMENT`.
// `ptr < end_ptr` check ensures it's valid to read `CHUNK_SIZE` bytes starting at `ptr`.
#[expect(clippy::cast_ptr_alignment)]
let chunk = unsafe { ptr.cast::<AlignedChunk>().as_ref().unwrap_unchecked() };
if chunk.contains_unicode() {
// SAFETY: `ptr` is equal to or after `start_ptr`. Both are within bounds of `bytes`.
// `ptr` is derived from `start_ptr`.
let offset = unsafe { offset_from(ptr, start_ptr) };
process_slice(&chunk.0, offset);
}

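// SAFETY: `ptr < end_ptr`, and the distance between them is always a multiple of
// `CHUNK_SIZE`, so `ptr + CHUNK_SIZE` cannot go out of bounds of `bytes`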
ptr = unsafe { ptr.add(CHUNK_SIZE) };
}

// Process last chunk
if last_chunk_bytes > 0 {
// SAFETY: `ptr + last_chunk_bytes` goes up to end of `bytes`.
// `bytes` is a `&[u8]` so guaranteed initialized and valid for reads.
let last_chunk = unsafe { slice::from_raw_parts(ptr, last_chunk_bytes) };
// SAFETY: `ptr` is after `start_ptr`. Both are within bounds of `bytes`.
// `ptr` is derived from `start_ptr`.
let offset = unsafe { offset_from(ptr, start_ptr) };
process_slice(last_chunk, offset);
}
}
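
As a quick sanity check of the counting rule above, the following standalone snippet (illustration only, not part of the commit; `utf16_len_by_rule` is an invented name) compares the rule against Rust's own UTF-16 encoder:

fn utf16_len_by_rule(s: &str) -> usize {
    // UTF-16 len = UTF-8 len - count of bytes >= 0xC0 - count of bytes >= 0xE0
    let bytes = s.as_bytes();
    let ge_c0 = bytes.iter().filter(|&&b| b >= 0xC0).count();
    let ge_e0 = bytes.iter().filter(|&&b| b >= 0xE0).count();
    bytes.len() - ge_c0 - ge_e0
}

fn main() {
    // Samples cover 1-byte, 2-byte, 3-byte, and 4-byte UTF-8 sequences
    for s in ["abc", "é", "€", "🤨", "_🤨_🤨_"] {
        assert_eq!(utf16_len_by_rule(s), s.encode_utf16().count());
    }
}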

/// Calculate distance in bytes from `from_ptr` to `to_ptr`.
///
/// # SAFETY
/// * `from_ptr` must be before or equal to `to_ptr`.
/// * Both pointers must point to within the same object.
/// * Both pointers must be derived from the same original pointer.
#[inline]
unsafe fn offset_from(to_ptr: *const u8, from_ptr: *const u8) -> usize {
let offset = unsafe { to_ptr.offset_from(from_ptr) };
unsafe { usize::try_from(offset).unwrap_unchecked() }
}
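
The pointer arithmetic in `build_translations` amounts to a head/body/tail split, which can be pictured in safe Rust as follows (an illustrative sketch with an invented name, not the commit's actual code; it assumes `bytes.len() >= CHUNK_SIZE`, which `build_translations` has already checked by the time it does this split):

fn split_for_chunks(bytes: &[u8]) -> (&[u8], &[u8], &[u8]) {
    // Unaligned head: bytes before the first address aligned to `CHUNK_ALIGNMENT`.
    // `align_offset` returns a value below `CHUNK_ALIGNMENT`, so `head_len <= bytes.len()`
    let head_len = bytes.as_ptr().align_offset(CHUNK_ALIGNMENT);
    // Aligned body: the largest multiple of `CHUNK_SIZE` that fits after the head
    let body_len = (bytes.len() - head_len) & !(CHUNK_SIZE - 1);
    let (head, rest) = bytes.split_at(head_len);
    let (body, tail) = rest.split_at(body_len);
    (head, body, tail)
}

`head` and `tail` are processed byte-by-byte; each 32-byte chunk of `body` gets the fast SIMD ASCII check first, and is only scanned byte-by-byte if that check finds non-ASCII bytes.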

#[cfg(test)]
mod test {
use oxc_allocator::Allocator;
@@ -459,12 +574,55 @@ mod test {
("_🤨_🤨_", &[(0, 0), (1, 1), (5, 3), (6, 4), (10, 6), (11, 7)]),
];

// Convert cases to `Vec`
let mut cases_vec = cases
.iter()
.map(|&(text, translations)| (text, translations.to_vec()))
.collect::<Vec<_>>();

// Create 1 long string containing 99 repeats of each test case, concatenated
let repeats = 99u32;
let mut texts = String::new();
for (text, _) in cases {
for _i in 0..repeats {
texts.push_str(text);
}
}

// Generate a further test case for each of the defined cases, repeated 99 times.
// Each of these cases references a slice of the large `texts` string.
// The reason for this is so that the string slices have uneven alignments, which exercises all
// parts of `build_translations` - it handles the unaligned head/tail of the source text
// differently from the aligned main body.
// The number of repeats is odd (99) for the same reason - to ensure each string slice begins at
// a memory address which is not evenly aligned.
let mut offset = 0;
for &(text, translations) in cases {
let end_offset = offset + text.len() * (repeats as usize);
let repeated_text = &texts[offset..end_offset];

let (len_utf8, len_utf16) = *translations.last().unwrap();
assert_eq!(text.len(), len_utf8 as usize);

let mut repeated_translations = vec![];
for i in 0..repeats {
for &(offset_utf8, offset_utf16) in translations {
repeated_translations
.push((offset_utf8 + len_utf8 * i, offset_utf16 + len_utf16 * i));
}
}

cases_vec.push((repeated_text, repeated_translations));

offset = end_offset;
}

for (text, translations) in cases_vec {
let table = Utf8ToUtf16::new(text);
let converter = table.converter();
if let Some(mut converter) = converter {
// Iterate in forwards order
for &(utf8_offset, expected_utf16_offset) in translations {
for &(utf8_offset, expected_utf16_offset) in &translations {
let mut utf16_offset = utf8_offset;
converter.convert_offset(&mut utf16_offset);
assert_eq!(utf16_offset, expected_utf16_offset);
@@ -478,7 +636,7 @@
}
} else {
// No Unicode chars. All offsets should be the same.
for &(utf8_offset, expected_utf16_offset) in translations {
for &(utf8_offset, expected_utf16_offset) in &translations {
assert_eq!(utf8_offset, expected_utf16_offset);
}
}
