Skip to content

Commit

Permalink
refactor(lexer): reduce scope of unsafe blocks
Browse files Browse the repository at this point in the history
  • Loading branch information
overlookmotel committed Feb 24, 2025
1 parent 6764b8d commit 4d7da25
Show file tree
Hide file tree
Showing 4 changed files with 302 additions and 318 deletions.
84 changes: 41 additions & 43 deletions crates/oxc_parser/src/lexer/identifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,51 +48,49 @@ impl<'a> Lexer<'a> {
/// * `self.source` must not be exhausted (at least 1 char remaining).
/// * Next char must be ASCII.
pub(super) unsafe fn identifier_name_handler(&mut self) -> &'a str {
unsafe {
// Advance past 1st byte.
// SAFETY: Caller guarantees not at EOF, and next byte is ASCII.
let after_first = self.source.position().add(1);

// Consume bytes which are part of identifier
let next_byte = byte_search! {
lexer: self,
table: NOT_ASCII_ID_CONTINUE_TABLE,
start: after_first,
handle_eof: {
// Return identifier minus its first char.
// SAFETY: `lexer.source` is positioned at EOF, so there is no valid value
// of `after_first` which could be after current position.
return unsafe { self.source.str_from_pos_to_current_unchecked(after_first) };
},
};

// Found a matching byte.
// Either end of identifier found, or a Unicode char, or `\` escape.
// Handle uncommon cases in cold branches to keep the common ASCII path
// as fast as possible.
if !next_byte.is_ascii() {
return cold_branch(|| {
// SAFETY: `after_first` is position after consuming 1 byte, so subtracting 1
// makes `start_pos` `source`'s position as it was at start of this function
let start_pos = unsafe { after_first.sub(1) };
&self.identifier_tail_unicode(start_pos)[1..]
});
}
if next_byte == b'\\' {
return cold_branch(|| {
// SAFETY: `after_first` is position after consuming 1 byte, so subtracting 1
// makes `start_pos` `source`'s position as it was at start of this function
let start_pos = unsafe { after_first.sub(1) };
&self.identifier_backslash(start_pos, false)[1..]
});
}
// Advance past 1st byte.
// SAFETY: Caller guarantees not at EOF, and next byte is ASCII.
let after_first = unsafe { self.source.position().add(1) };

// Return identifier minus its first char.
// SAFETY: `after_first` was position of `lexer.source` at start of this search.
// Searching only proceeds in forwards direction, so `lexer.source.position()`
// cannot be before `after_first`.
unsafe { self.source.str_from_pos_to_current_unchecked(after_first) }
// Consume bytes which are part of identifier
let next_byte = byte_search! {
lexer: self,
table: NOT_ASCII_ID_CONTINUE_TABLE,
start: after_first,
handle_eof: {
// Return identifier minus its first char.
// SAFETY: `lexer.source` is positioned at EOF, so there is no valid value
// of `after_first` which could be after current position.
return unsafe { self.source.str_from_pos_to_current_unchecked(after_first) };
},
};

// Found a matching byte.
// Either end of identifier found, or a Unicode char, or `\` escape.
// Handle uncommon cases in cold branches to keep the common ASCII path
// as fast as possible.
if !next_byte.is_ascii() {
return cold_branch(|| {
// SAFETY: `after_first` is position after consuming 1 byte, so subtracting 1
// makes `start_pos` `source`'s position as it was at start of this function
let start_pos = unsafe { after_first.sub(1) };
&self.identifier_tail_unicode(start_pos)[1..]
});
}
if next_byte == b'\\' {
return cold_branch(|| {
// SAFETY: `after_first` is position after consuming 1 byte, so subtracting 1
// makes `start_pos` `source`'s position as it was at start of this function
let start_pos = unsafe { after_first.sub(1) };
&self.identifier_backslash(start_pos, false)[1..]
});
}

// Return identifier minus its first char.
// SAFETY: `after_first` was position of `lexer.source` at start of this search.
// Searching only proceeds in forwards direction, so `lexer.source.position()`
// cannot be before `after_first`.
unsafe { self.source.str_from_pos_to_current_unchecked(after_first) }
}

/// Handle rest of identifier after first byte of a multi-byte Unicode char found.
Expand Down
35 changes: 17 additions & 18 deletions crates/oxc_parser/src/lexer/jsx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,25 +28,24 @@ impl Lexer<'_> {
/// * `delimiter` must be an ASCII character.
/// * Next char in `lexer.source` must be ASCII.
pub(super) unsafe fn read_jsx_string_literal(&mut self, delimiter: u8) -> Kind {
unsafe {
// Skip opening quote
debug_assert!(delimiter.is_ascii());
// SAFETY: Caller guarantees next byte is ASCII, so `.add(1)` is a UTF-8 char boundary
let after_opening_quote = self.source.position().add(1);
let remaining = self.source.str_from_pos_to_end(after_opening_quote);
debug_assert!(delimiter.is_ascii());

let len = memchr(delimiter, remaining.as_bytes());
if let Some(len) = len {
// SAFETY: `after_opening_quote` + `len` is position of delimiter.
// Caller guarantees delimiter is ASCII, so 1 byte after it is a UTF-8 char boundary.
let after_closing_quote = after_opening_quote.add(len + 1);
self.source.set_position(after_closing_quote);
Kind::Str
} else {
self.source.advance_to_end();
self.error(diagnostics::unterminated_string(self.unterminated_range()));
Kind::Undetermined
}
// Skip opening quote
// SAFETY: Caller guarantees next byte is ASCII, so `.add(1)` is a UTF-8 char boundary
let after_opening_quote = unsafe { self.source.position().add(1) };
let remaining = self.source.str_from_pos_to_end(after_opening_quote);

let len = memchr(delimiter, remaining.as_bytes());
if let Some(len) = len {
// SAFETY: `after_opening_quote` + `len` is position of delimiter.
// Caller guarantees delimiter is ASCII, so 1 byte after it is a UTF-8 char boundary.
let after_closing_quote = unsafe { after_opening_quote.add(len + 1) };
self.source.set_position(after_closing_quote);
Kind::Str
} else {
self.source.advance_to_end();
self.error(diagnostics::unterminated_string(self.unterminated_range()));
Kind::Undetermined
}
}

Expand Down
134 changes: 62 additions & 72 deletions crates/oxc_parser/src/lexer/source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -231,11 +231,9 @@ impl<'a> Source<'a> {
&self,
pos: SourcePosition<'a>,
) -> &'a str {
unsafe {
// SAFETY: Caller guarantees `pos` is not after current position of `Source`.
// `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`.
self.str_between_positions_unchecked(pos, SourcePosition::new(self.ptr))
}
// SAFETY: Caller guarantees `pos` is not after current position of `Source`.
// `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`.
unsafe { self.str_between_positions_unchecked(pos, SourcePosition::new(self.ptr)) }
}

/// Get string slice from current position of `Source` up to a `SourcePosition`, without checks.
Expand All @@ -250,11 +248,9 @@ impl<'a> Source<'a> {
&self,
pos: SourcePosition<'a>,
) -> &'a str {
unsafe {
// SAFETY: Caller guarantees `pos` is not before current position of `Source`.
// `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`.
self.str_between_positions_unchecked(SourcePosition::new(self.ptr), pos)
}
// SAFETY: Caller guarantees `pos` is not before current position of `Source`.
// `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`.
unsafe { self.str_between_positions_unchecked(SourcePosition::new(self.ptr), pos) }
}

/// Get string slice from a `SourcePosition` up to the end of `Source`.
Expand All @@ -276,28 +272,28 @@ impl<'a> Source<'a> {
start: SourcePosition<'a>,
end: SourcePosition<'a>,
) -> &'a str {
// Check `start` is not after `end`
debug_assert!(start.ptr <= end.ptr);
// Check `start` and `end` are within bounds of `Source`
debug_assert!(start.ptr >= self.start);
debug_assert!(end.ptr <= self.end);
// Check `start` and `end` are on UTF-8 character boundaries.
// SAFETY: Above assertions ensure `start` and `end` are valid to read from if not at EOF.
debug_assert!(start.ptr == self.end || !is_utf8_cont_byte(unsafe { start.read() }));
debug_assert!(end.ptr == self.end || !is_utf8_cont_byte(unsafe { end.read() }));

// SAFETY: Caller guarantees `start` is not after `end`.
// `SourcePosition`s can only be created from a `Source`.
// `Source::new` takes a `UniquePromise`, which guarantees that it's the only `Source`
// in existence on this thread. `Source` is not `Sync` or `Send`, so no possibility another
// `Source` originated on another thread can "jump" onto this one.
// This is sufficient to guarantee that any `SourcePosition` that parser/lexer holds must be
// from this `Source`, therefore `start.ptr` and `end.ptr` must both be within the same
// allocation, and derived from the same original pointer.
// Invariants of `Source` and `SourcePosition` types guarantee that both are positioned
// on UTF-8 character boundaries. So slicing source text between these 2 points will always
// yield a valid UTF-8 string.
unsafe {
// Check `start` is not after `end`
debug_assert!(start.ptr <= end.ptr);
// Check `start` and `end` are within bounds of `Source`
debug_assert!(start.ptr >= self.start);
debug_assert!(end.ptr <= self.end);
// Check `start` and `end` are on UTF-8 character boundaries.
// SAFETY: Above assertions ensure `start` and `end` are valid to read from if not at EOF.
debug_assert!(start.ptr == self.end || !is_utf8_cont_byte(start.read()));
debug_assert!(end.ptr == self.end || !is_utf8_cont_byte(end.read()));

// SAFETY: Caller guarantees `start` is not after `end`.
// `SourcePosition`s can only be created from a `Source`.
// `Source::new` takes a `UniquePromise`, which guarantees that it's the only `Source`
// in existence on this thread. `Source` is not `Sync` or `Send`, so no possibility another
// `Source` originated on another thread can "jump" onto this one.
// This is sufficient to guarantee that any `SourcePosition` that parser/lexer holds must be
// from this `Source`, therefore `start.ptr` and `end.ptr` must both be within the same
// allocation, and derived from the same original pointer.
// Invariants of `Source` and `SourcePosition` types guarantee that both are positioned
// on UTF-8 character boundaries. So slicing source text between these 2 points will always
// yield a valid UTF-8 string.
let len = end.addr() - start.addr();
let slice = slice::from_raw_parts(start.ptr, len);
std::str::from_utf8_unchecked(slice)
Expand Down Expand Up @@ -447,14 +443,12 @@ impl<'a> Source<'a> {
#[expect(dead_code)]
#[inline]
unsafe fn next_byte(&mut self) -> Option<u8> {
unsafe {
#[expect(clippy::if_not_else)] // Hot path first
if !self.is_eof() {
// SAFETY: Safe to read from `ptr` as we just checked it's not out of bounds
Some(self.next_byte_unchecked())
} else {
None
}
#[expect(clippy::if_not_else)] // Hot path first
if !self.is_eof() {
// SAFETY: Safe to read from `ptr` as we just checked it's not out of bounds
Some(unsafe { self.next_byte_unchecked() })
} else {
None
}
}

Expand All @@ -480,11 +474,11 @@ impl<'a> Source<'a> {
/// are *not* safe to call until one of above conditions is satisfied.
#[inline]
pub(super) unsafe fn next_byte_unchecked(&mut self) -> u8 {
// SAFETY: Caller guarantees not at end of file i.e. `ptr != end`.
// Methods of this type provide no way for `ptr` to be before `start` or after `end`.
// Therefore always valid to read a byte from `ptr`, and incrementing `ptr` cannot result
// in `ptr > end`.
unsafe {
// SAFETY: Caller guarantees not at end of file i.e. `ptr != end`.
// Methods of this type provide no way for `ptr` to be before `start` or after `end`.
// Therefore always valid to read a byte from `ptr`, and incrementing `ptr` cannot result
// in `ptr > end`.
let byte = self.peek_byte_unchecked();
self.ptr = self.ptr.add(1);
byte
Expand Down Expand Up @@ -547,15 +541,13 @@ impl<'a> Source<'a> {
/// Caller must ensure `Source` is not at end of file.
#[inline]
pub(super) unsafe fn peek_byte_unchecked(&self) -> u8 {
unsafe {
// SAFETY: Caller guarantees `ptr` is before `end` (i.e. not at end of file).
// Methods of this type provide no way to allow `ptr` to be before `start`.
// `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory.
// `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `self.ptr`
// addresses cannot be aliased by a `&mut` ref as long as `Source` exists.
debug_assert!(self.ptr >= self.start && self.ptr < self.end);
self.position().read()
}
// SAFETY: Caller guarantees `ptr` is before `end` (i.e. not at end of file).
// Methods of this type provide no way to allow `ptr` to be before `start`.
// `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory.
// `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `self.ptr`
// addresses cannot be aliased by a `&mut` ref as long as `Source` exists.
debug_assert!(self.ptr >= self.start && self.ptr < self.end);
unsafe { self.position().read() }
}
}

Expand Down Expand Up @@ -631,17 +623,15 @@ impl SourcePosition<'_> {
/// because if a `&mut` reference existed, that would violate Rust's aliasing rules.
#[inline]
pub(super) unsafe fn read(self) -> u8 {
unsafe {
// SAFETY:
// Caller guarantees `self` is not at end of source text.
// `Source` is created from a valid `&str`, so points to allocated, initialized memory.
// `Source` conceptually holds the source text `&str`, which guarantees no mutable references
// to the same memory can exist, as that would violate Rust's aliasing rules.
// Pointer is "dereferenceable" by definition as a `u8` is 1 byte and cannot span multiple objects.
// Alignment is not relevant as `u8` is aligned on 1 (i.e. no alignment requirements).
debug_assert!(!self.ptr.is_null());
*self.ptr.as_ref().unwrap_unchecked()
}
// SAFETY:
// Caller guarantees `self` is not at end of source text.
// `Source` is created from a valid `&str`, so points to allocated, initialized memory.
// `Source` conceptually holds the source text `&str`, which guarantees no mutable references
// to the same memory can exist, as that would violate Rust's aliasing rules.
// Pointer is "dereferenceable" by definition as a `u8` is 1 byte and cannot span multiple objects.
// Alignment is not relevant as `u8` is aligned on 1 (i.e. no alignment requirements).
debug_assert!(!self.ptr.is_null());
unsafe { *self.ptr.as_ref().unwrap_unchecked() }
}

/// Read 2 bytes from this `SourcePosition`.
Expand All @@ -651,16 +641,16 @@ impl SourcePosition<'_> {
/// i.e. if source length is 10, `self` must be on position 8 max.
#[inline]
pub(super) unsafe fn read2(self) -> [u8; 2] {
// SAFETY:
// Caller guarantees `self` is not at no later than 2 bytes before end of source text.
// `Source` is created from a valid `&str`, so points to allocated, initialized memory.
// `Source` conceptually holds the source text `&str`, which guarantees no mutable references
// to the same memory can exist, as that would violate Rust's aliasing rules.
// Pointer is "dereferenceable" by definition as a `u8` is 1 byte and cannot span multiple objects.
// Alignment is not relevant as `u8` is aligned on 1 (i.e. no alignment requirements).
debug_assert!(!self.ptr.is_null());
#[expect(clippy::ptr_as_ptr)]
unsafe {
// SAFETY:
// Caller guarantees `self` is not at no later than 2 bytes before end of source text.
// `Source` is created from a valid `&str`, so points to allocated, initialized memory.
// `Source` conceptually holds the source text `&str`, which guarantees no mutable references
// to the same memory can exist, as that would violate Rust's aliasing rules.
// Pointer is "dereferenceable" by definition as a `u8` is 1 byte and cannot span multiple objects.
// Alignment is not relevant as `u8` is aligned on 1 (i.e. no alignment requirements).
debug_assert!(!self.ptr.is_null());
#[expect(clippy::ptr_as_ptr)]
let p = self.ptr as *const [u8; 2];
*p.as_ref().unwrap_unchecked()
}
Expand Down
Loading

0 comments on commit 4d7da25

Please sign in to comment.