diff --git a/docs/changelog.md b/docs/changelog.md index 50d62e1dde..5c87c6b054 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -236,6 +236,19 @@ what we publish. # ... ``` +- The `String.__len__()` and `StringSlice.__len__()` methods now return the + length of the string in bytes. + + Previously, these methods were documented to note that they would eventually + return a length in Unicode codepoints. They have been changed to guarantee + a length in bytes, since the length in bytes is how they are most often used + today (for example, as bounds to low-level memory manipulation logic). + Additionally, length in codepoints is a more specialized notion of string + length that is rarely the correct metric. + + Users that know they need the length in codepoints can use the + `str.char_length()` method, or `len(str.chars())`. + - Various functionality has moved from `String` and `StringRef` to the more general `StringSlice` type. diff --git a/stdlib/src/collections/string/string.mojo b/stdlib/src/collections/string/string.mojo index 58727d8927..bd6340cbf5 100644 --- a/stdlib/src/collections/string/string.mojo +++ b/stdlib/src/collections/string/string.mojo @@ -1063,22 +1063,46 @@ struct String( """ return self.byte_length() > 0 + @always_inline fn __len__(self) -> Int: - """Gets the string length, in bytes (for now) PREFER: - String.byte_length(), a future version will make this method return - Unicode codepoints. + """Get the string length in bytes. + + This function returns the number of bytes in the underlying UTF-8 + representation of the string. + + To get the number of Unicode codepoints in a string, use + `len(str.chars())`. Returns: - The string length, in bytes (for now). - """ - var unicode_length = self.byte_length() + The string length in bytes. 
+ + # Examples - # TODO: everything uses this method assuming it's byte length - # for i in range(unicode_length): - # if _utf8_byte_type(self._buffer[i]) == 1: - # unicode_length -= 1 + Query the length of a string, in bytes and Unicode codepoints: - return unicode_length + ```mojo + from testing import assert_equal + + var s = String("ನಮಸ್ಕಾರ") + + assert_equal(len(s), 21) + assert_equal(len(s.chars()), 7) + ``` + + Strings containing only ASCII characters have the same byte and + Unicode codepoint length: + + ```mojo + from testing import assert_equal + + var s = String("abc") + + assert_equal(len(s), 3) + assert_equal(len(s.chars()), 3) + ``` + . + """ + return self.byte_length() @always_inline fn __str__(self) -> String: diff --git a/stdlib/src/collections/string/string_slice.mojo b/stdlib/src/collections/string/string_slice.mojo index c6425bf838..5f0bf7ec7b 100644 --- a/stdlib/src/collections/string/string_slice.mojo +++ b/stdlib/src/collections/string/string_slice.mojo @@ -547,13 +547,48 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]]( else: return "'" + result + "'" + @always_inline fn __len__(self) -> Int: - """Nominally returns the _length in Unicode codepoints_ (not bytes!). + """Get the string length in bytes. + + This function returns the number of bytes in the underlying UTF-8 + representation of the string. + + To get the number of Unicode codepoints in a string, use + `len(str.chars())`. Returns: - The length in Unicode codepoints. + The string length in bytes. 
+ + # Examples + + Query the length of a string, in bytes and Unicode codepoints: + + ```mojo + from collections.string import StringSlice + from testing import assert_equal + + var s = StringSlice("ನಮಸ್ಕಾರ") + + assert_equal(len(s), 21) + assert_equal(len(s.chars()), 7) + ``` + + Strings containing only ASCII characters have the same byte and + Unicode codepoint length: + + ```mojo + from collections.string import StringSlice + from testing import assert_equal + + var s = StringSlice("abc") + + assert_equal(len(s), 3) + assert_equal(len(s.chars()), 3) + ``` + . """ - return self.char_length() + return self.byte_length() fn write_to[W: Writer](self, mut writer: W): """Formats this string slice to the provided `Writer`. @@ -1012,6 +1047,46 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]]( Returns: The length in Unicode codepoints. + + # Examples + + Query the length of a string, in bytes and Unicode codepoints: + + ```mojo + from collections.string import StringSlice + from testing import assert_equal + + var s = StringSlice("ನಮಸ್ಕಾರ") + + assert_equal(s.char_length(), 7) + assert_equal(len(s), 21) + ``` + + Strings containing only ASCII characters have the same byte and + Unicode codepoint length: + + ```mojo + from collections.string import StringSlice + from testing import assert_equal + + var s = StringSlice("abc") + + assert_equal(s.char_length(), 3) + assert_equal(len(s), 3) + ``` + + The character length of a string with visual combining characters is + the length in Unicode codepoints, not grapheme clusters: + + ```mojo + from collections.string import StringSlice + from testing import assert_equal + + var s = StringSlice("á") + assert_equal(s.char_length(), 2) + assert_equal(s.byte_length(), 3) + ``` + . """ # Every codepoint is encoded as one leading byte + 0 to 3 continuation # bytes. 
diff --git a/stdlib/test/collections/string/test_string.mojo b/stdlib/test/collections/string/test_string.mojo index 945406f046..fd070192d4 100644 --- a/stdlib/test/collections/string/test_string.mojo +++ b/stdlib/test/collections/string/test_string.mojo @@ -89,6 +89,20 @@ def test_copy(): assert_equal("fine", s1) +def test_len(): + # String length is in bytes, not codepoints. + var s0 = String("ನಮಸ್ಕಾರ") + + assert_equal(len(s0), 21) + assert_equal(len(s0.chars()), 7) + + # For ASCII strings, the byte and codepoint lengths are the same: + var s1 = String("abc") + + assert_equal(len(s1), 3) + assert_equal(len(s1.chars()), 3) + + def test_equality_operators(): var s0 = String("abc") var s1 = String("def") @@ -1475,6 +1489,7 @@ def test_reserve(): def main(): test_constructors() test_copy() + test_len() test_equality_operators() test_comparison_operators() test_add() diff --git a/stdlib/test/collections/string/test_string_slice.mojo b/stdlib/test/collections/string/test_string_slice.mojo index 105c88c159..166bbbf297 100644 --- a/stdlib/test/collections/string/test_string_slice.mojo +++ b/stdlib/test/collections/string/test_string_slice.mojo @@ -151,23 +151,22 @@ fn test_heap_string_from_string_slice() raises: fn test_slice_len() raises: - alias str1: StringLiteral = "12345" - alias str2: StringLiteral = "1234" - alias str3: StringLiteral = "123" - alias str4: StringLiteral = "12" - alias str5: StringLiteral = "1" + assert_equal(5, len(StringSlice("12345"))) + assert_equal(4, len(StringSlice("1234"))) + assert_equal(3, len(StringSlice("123"))) + assert_equal(2, len(StringSlice("12"))) + assert_equal(1, len(StringSlice("1"))) + assert_equal(0, len(StringSlice(""))) - alias slice1 = str1.as_string_slice() - alias slice2 = str2.as_string_slice() - alias slice3 = str3.as_string_slice() - alias slice4 = str4.as_string_slice() - alias slice5 = str5.as_string_slice() + # String length is in bytes, not codepoints. 
+ var s0 = String("ನಮಸ್ಕಾರ") + assert_equal(len(s0), 21) + assert_equal(len(s0.chars()), 7) - assert_equal(5, len(slice1)) - assert_equal(4, len(slice2)) - assert_equal(3, len(slice3)) - assert_equal(2, len(slice4)) - assert_equal(1, len(slice5)) + # For ASCII strings, the byte and codepoint lengths are the same: + var s1 = String("abc") + assert_equal(len(s1), 3) + assert_equal(len(s1.chars()), 3) fn test_slice_char_length() raises: @@ -189,6 +188,13 @@ fn test_slice_char_length() raises: assert_equal(s3.byte_length(), 37) assert_equal(s3.char_length(), 19) + # Character length is in codepoints, not graphemes. + # This is thumbs up + a skin tone modifier codepoint. + var s4 = StringSlice("👍🏻") + assert_equal(s4.byte_length(), 8) + assert_equal(s4.char_length(), 2) + # TODO: assert_equal(s4.grapheme_count(), 1) + fn test_slice_eq() raises: var str1: String = "12345"