Skip to content

Commit

Permalink
fix details
Browse files Browse the repository at this point in the history
Signed-off-by: martinvuyk <martin.vuyklop@gmail.com>
  • Loading branch information
martinvuyk committed Feb 1, 2025
1 parent 308c3c4 commit 00760f8
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 63 deletions.
50 changes: 33 additions & 17 deletions stdlib/src/builtin/char.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ from collections.string import StringSlice
from bit import count_leading_zeros

from memory import UnsafePointer
from sys.intrinsics import likely


@always_inline
Expand Down Expand Up @@ -423,13 +424,13 @@ struct Char(CollectionElement, EqualityComparable, Intable, Stringable):
return self._scalar_value

@always_inline
fn unsafe_write_utf8(self, ptr: UnsafePointer[Byte]) -> UInt:
fn unsafe_write_utf8[
optimize_ascii: Bool = True
](self, ptr: UnsafePointer[Byte]) -> UInt:
"""Shift unicode to utf8 representation.
Safety:
`ptr` MUST point to at least `self.utf8_byte_length()` allocated
bytes or else an out-of-bounds write will occur, which is undefined
behavior.
Parameters:
optimize_ascii: Optimize for languages with mostly ASCII characters.
Args:
ptr: Pointer value to write the encoded UTF-8 bytes. Must validly
Expand All @@ -439,6 +440,11 @@ struct Char(CollectionElement, EqualityComparable, Intable, Stringable):
Returns:
Returns the number of bytes written.
Safety:
`ptr` MUST point to at least `self.utf8_byte_length()` allocated
bytes or else an out-of-bounds write will occur, which is undefined
behavior.
### Unicode (represented as UInt32 BE) to UTF-8 conversion:
- 1: 00000000 00000000 00000000 0aaaaaaa -> 0aaaaaaa
- a
Expand All @@ -456,18 +462,28 @@ struct Char(CollectionElement, EqualityComparable, Intable, Stringable):

var num_bytes = self.utf8_byte_length()

if num_bytes == 1:
ptr[0] = UInt8(c)
return 1

var shift = 6 * (num_bytes - 1)
var mask = UInt8(0xFF) >> UInt8(num_bytes + 1)
var num_bytes_marker = UInt8(0xFF) << (8 - num_bytes)
ptr[0] = ((c >> shift) & mask) | num_bytes_marker
for i in range(1, num_bytes):
shift -= 6
ptr[i] = ((c >> shift) & 0b0011_1111) | 0b1000_0000

@parameter
if optimize_ascii:
if likely(num_bytes == 1):
ptr[0] = UInt8(c)
return 1
var shift = 6 * (num_bytes - 1)
var mask = UInt8(0xFF) >> (num_bytes + 1)
var num_bytes_marker = UInt8(0xFF) << (8 - num_bytes)
ptr[0] = ((c >> shift) & mask) | num_bytes_marker
for i in range(1, num_bytes):
shift -= 6
ptr[i] = ((c >> shift) & 0b0011_1111) | 0b1000_0000
else:
var shift = 6 * (num_bytes - 1)
var mask = UInt8(0xFF) >> (num_bytes + Int(num_bytes > 1))
var num_bytes_marker = UInt8(0xFF) << (8 - num_bytes)
ptr[0] = ((c >> shift) & mask) | (
num_bytes_marker & -Int(num_bytes != 1)
)
for i in range(1, num_bytes):
shift -= 6
ptr[i] = ((c >> shift) & 0b0011_1111) | 0b1000_0000
return num_bytes

@always_inline
Expand Down
1 change: 0 additions & 1 deletion stdlib/src/collections/string/string.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ from collections.string.string_slice import (
_StringSliceIter,
_to_string_list,
_utf8_byte_type,
_shift_unicode_to_utf8,
)

# ===----------------------------------------------------------------------=== #
Expand Down
45 changes: 0 additions & 45 deletions stdlib/src/collections/string/string_slice.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -76,51 +76,6 @@ fn _utf8_first_byte_sequence_length(b: Byte) -> Int:
return Int(count_leading_zeros(~b) | (b < 0b1000_0000).cast[DType.uint8]())


fn _shift_unicode_to_utf8[
    optimize_ascii: Bool = True
](ptr: UnsafePointer[UInt8], c: Int, num_bytes: Int):
    """Encode the Unicode scalar value `c` as UTF-8 bytes starting at `ptr`.

    Parameters:
        optimize_ascii: When True, the single-byte case is handled by a
            dedicated early-exit branch hinted as likely. When False, the
            single-byte case is folded into the general formula using
            arithmetic masks instead of a separate branch.

    Args:
        ptr: Destination pointer; bytes `ptr[0]` through
            `ptr[num_bytes - 1]` are written.
        c: The Unicode value to encode.
        num_bytes: Number of UTF-8 bytes to emit for `c` (see table below).

    ### Unicode (represented as UInt32 BE) to UTF-8 conversion:
    - 1: 00000000 00000000 00000000 0aaaaaaa -> 0aaaaaaa
        - a
    - 2: 00000000 00000000 00000aaa aabbbbbb -> 110aaaaa 10bbbbbb
        - (a >> 6)  | 0b11000000, b         | 0b10000000
    - 3: 00000000 00000000 aaaabbbb bbcccccc -> 1110aaaa 10bbbbbb 10cccccc
        - (a >> 12) | 0b11100000, (b >> 6)  | 0b10000000, c        | 0b10000000
    - 4: 00000000 000aaabb bbbbcccc ccdddddd -> 11110aaa 10bbbbbb 10cccccc
    10dddddd
        - (a >> 18) | 0b11110000, (b >> 12) | 0b10000000, (c >> 6) | 0b10000000,
        d | 0b10000000
    """

    # Fast path: a one-byte sequence is the value itself.
    @parameter
    if optimize_ascii:
        if likely(num_bytes == 1):
            ptr[0] = UInt8(c)
            return

    # Each continuation byte carries 6 payload bits, so the lead byte's
    # payload starts 6 * (num_bytes - 1) bits up.
    var bit_offset = 6 * (num_bytes - 1)
    # Length marker: the top `num_bytes` bits set (e.g. 0b1110_0000 for 3).
    var lead_marker = UInt8(0xFF) << (8 - num_bytes)

    @parameter
    if optimize_ascii:
        # Multi-byte lead: `num_bytes` marker bits plus one zero separator.
        var lead_mask = UInt8(0xFF) >> (num_bytes + 1)
        ptr[0] = ((c >> bit_offset) & lead_mask) | lead_marker
    else:
        # For a single byte the mask keeps 7 payload bits and the marker is
        # zeroed out (`-Int(False)` is 0); otherwise same as the branch above.
        var lead_mask = UInt8(0xFF) >> (num_bytes + Int(num_bytes > 1))
        ptr[0] = ((c >> bit_offset) & lead_mask) | (
            lead_marker & -Int(num_bytes != 1)
        )

    # Continuation bytes: 0b10xxxxxx, consuming 6 bits each from high to low.
    for i in range(1, num_bytes):
        bit_offset -= 6
        ptr[i] = ((c >> bit_offset) & 0b0011_1111) | 0b1000_0000


@always_inline
fn _utf8_byte_type(b: SIMD[DType.uint8, _], /) -> __type_of(b):
"""UTF-8 byte type.
Expand Down

0 comments on commit 00760f8

Please sign in to comment.