diff --git a/mojo/stdlib/benchmarks/collections/bench_string.mojo b/mojo/stdlib/benchmarks/collections/bench_string.mojo index 8b3acdc9f0..17370d459b 100644 --- a/mojo/stdlib/benchmarks/collections/bench_string.mojo +++ b/mojo/stdlib/benchmarks/collections/bench_string.mojo @@ -15,7 +15,7 @@ # the -t flag. Remember to replace it again before pushing any code. from collections import Dict, Optional -from collections.string import String +from collections.string import String, StringSlice from collections.string._utf8_validation import _is_valid_utf8 from os import abort from pathlib import _dir_of_current_file @@ -110,7 +110,7 @@ fn bench_string_split[ @always_inline @parameter fn call_fn() raises: - var res: List[String] + var res: List[StringSlice[__origin_of(items)]] @parameter if sequence: diff --git a/mojo/stdlib/src/builtin/file.mojo b/mojo/stdlib/src/builtin/file.mojo index e3ba22bf27..1e74cefa6c 100644 --- a/mojo/stdlib/src/builtin/file.mojo +++ b/mojo/stdlib/src/builtin/file.mojo @@ -37,7 +37,8 @@ from sys.ffi import OpaquePointer from memory import AddressSpace, Span, UnsafePointer -from utils import StringSlice, write_buffered +from collections.string import StringSlice +from utils import write_buffered @register_passable diff --git a/mojo/stdlib/src/builtin/string_literal.mojo b/mojo/stdlib/src/builtin/string_literal.mojo index f72938df4b..081b34628c 100644 --- a/mojo/stdlib/src/builtin/string_literal.mojo +++ b/mojo/stdlib/src/builtin/string_literal.mojo @@ -21,7 +21,8 @@ from collections.string.string_slice import ( StaticString, StringSlice, CodepointSliceIter, - _to_string_list, + to_string_list, + _split, ) from hashlib._hasher import _HashableWithHasher, _Hasher from sys.ffi import c_char @@ -457,18 +458,18 @@ struct StringLiteral( return self.__str__() fn __iter__(ref self) -> CodepointSliceIter[StaticConstantOrigin]: - """Return an iterator over the string literal. + """Iterate over the string unicode characters. Returns: - An iterator over the string. + An iterator of references to the string unicode characters. """ return CodepointSliceIter(self.as_string_slice()) fn __reversed__(self) -> CodepointSliceIter[StaticConstantOrigin, False]: - """Iterate backwards over the string, returning immutable references. + """Iterate backwards over the string unicode characters. Returns: - A reversed iterator over the string. + A reversed iterator of references to the string unicode characters. """ return CodepointSliceIter[StaticConstantOrigin, False]( self.as_string_slice() @@ -540,21 +541,18 @@ struct StringLiteral( Returns: A string slice pointing to this static string literal. """ - - # FIXME(MSTDL-160): - # Enforce UTF-8 encoding in StringLiteral so this is actually - # guaranteed to be valid. - return StaticString(ptr=self.unsafe_ptr(), length=self.byte_length()) + return StaticString(self) @always_inline fn as_bytes(self) -> Span[Byte, StaticConstantOrigin]: - """ - Returns a contiguous Span of the bytes owned by this string. + """Returns a contiguous slice of the bytes owned by this string. Returns: A contiguous slice pointing to the bytes owned by this string. - """ + Notes: + This does not include the trailing null terminator. + """ return Span[Byte, StaticConstantOrigin]( ptr=self.unsafe_ptr(), length=self.byte_length() ) @@ -569,7 +567,6 @@ struct StringLiteral( Notes: This does not include the trailing null terminator. """ - # Does NOT include the NUL terminator. return Span[Byte, __origin_of(self)]( ptr=self.unsafe_ptr(), length=self.byte_length() ) @@ -618,11 +615,11 @@ struct StringLiteral( `start`. If not found, returns -1. Args: - substr: The substring to find. - start: The offset from which to find. + substr: The substring to find. + start: The offset from which to find. Returns: - The offset of `substr` relative to the beginning of the string. + The offset of `substr` relative to the beginning of the string. """ return self.as_string_slice().find(substr, start=start) @@ -687,13 +684,36 @@ struct StringLiteral( """ return String(elems, sep=self) - fn split(self, sep: StringSlice, maxsplit: Int = -1) raises -> List[String]: - """Split the string literal by a separator. + @always_inline + fn split(self, sep: StringSlice, maxsplit: Int) -> List[StaticString]: + """Split the string by a separator. Args: sep: The string to split on. maxsplit: The maximum amount of items to split from String. - Defaults to unlimited. + + Returns: + A List of Strings containing the input split by the separator. + + Examples: + + ```mojo + # Splitting with maxsplit + _ = "1,2,3".split(",", maxsplit=1) # ['1', '2,3'] + # Splitting with starting or ending separators + _ = ",1,2,3,".split(",", maxsplit=1) # ['', '1,2,3,'] + _ = "123".split("", maxsplit=1) # ['', '123'] + ``` + . + """ + return self.as_string_slice().split(sep, maxsplit) + + @always_inline + fn split(self, sep: StringSlice) -> List[StaticString]: + """Split the string by a separator. + + Args: + sep: The string to split on. Returns: A List of Strings containing the input split by the separator. @@ -705,20 +725,40 @@ struct StringLiteral( _ = "hello world".split(" ") # ["hello", "world"] # Splitting adjacent separators _ = "hello,,world".split(",") # ["hello", "", "world"] + # Splitting with starting or ending separators + _ = ",1,2,3,".split(",") # ['', '1', '2', '3', ''] + _ = "123".split("") # ['', '1', '2', '3', ''] + ``` + . + """ + return self.as_string_slice().split(sep) + + @always_inline + fn split(self, *, maxsplit: Int) -> List[StaticString]: + """Split the string by every Whitespace separator. + + Args: + maxsplit: The maximum amount of items to split from String. + + Returns: + A List of Strings containing the input split by the separator. + + Examples: + + ```mojo # Splitting with maxsplit - _ = "1,2,3".split(",", 1) # ['1', '2,3'] + _ = "1 2 3".split(maxsplit=1) # ['1', '2 3'] ``` . """ - return String(self).split(sep, maxsplit) + return self.as_string_slice().split(maxsplit=maxsplit) - fn split(self, sep: NoneType = None, maxsplit: Int = -1) -> List[String]: - """Split the string literal by every whitespace separator. + @always_inline + fn split(self, sep: NoneType = None) -> List[StaticString]: + """Split the string by every Whitespace separator. Args: sep: None. - maxsplit: The maximum amount of items to split from string. Defaults - to unlimited. Returns: A List of Strings containing the input split by the separator. @@ -729,16 +769,16 @@ struct StringLiteral( # Splitting an empty string or filled with whitespaces _ = " ".split() # [] _ = "".split() # [] - # Splitting a string with leading, trailing, and middle whitespaces _ = " hello world ".split() # ["hello", "world"] # Splitting adjacent universal newlines: - _ = "hello \\t\\n\\v\\f\\r\\x1c\\x1d\\x1e\\x85\\u2028\\u2029world".split() - # ["hello", "world"] + _ = ( + "hello \\t\\n\\v\\f\\r\\x1c\\x1d\\x1e\\x85\\u2028\\u2029world" + ).split() # ["hello", "world"] ``` . """ - return String(self).split(sep, maxsplit) + return self.as_string_slice().split() fn splitlines(self, keepends: Bool = False) -> List[String]: """Split the string literal at line boundaries. This corresponds to Python's @@ -752,7 +792,7 @@ struct StringLiteral( Returns: A List of Strings containing the input split by line boundaries. """ - return _to_string_list(self.as_string_slice().splitlines(keepends)) + return to_string_list(self.as_string_slice().splitlines(keepends)) fn count(self, substr: StringSlice) -> Int: """Return the number of non-overlapping occurrences of substring diff --git a/mojo/stdlib/src/collections/string/string.mojo b/mojo/stdlib/src/collections/string/string.mojo index 85e121f004..8411ef5388 100644 --- a/mojo/stdlib/src/collections/string/string.mojo +++ b/mojo/stdlib/src/collections/string/string.mojo @@ -41,6 +41,19 @@ from python import PythonObject from utils import IndexList, Variant, Writable, Writer, write_args from utils.write import _TotalWritableBytes, _WriteBufferHeap, write_buffered +from collections.string._unicode import ( + is_lowercase, + is_uppercase, + to_lowercase, + to_uppercase, +) +from collections.string.format import _CurlyEntryFormattable, _FormatCurlyEntry +from collections.string.string_slice import ( + StringSlice, + StaticString, + to_string_list, + _utf8_byte_type, +) # ===----------------------------------------------------------------------=== # # ord @@ -1034,20 +1047,19 @@ struct String( """ self._iadd(other.as_bytes()) - @deprecated("Use `str.codepoints()` or `str.codepoint_slices()` instead.") fn __iter__(self) -> CodepointSliceIter[__origin_of(self)]: - """Iterate over the string, returning immutable references. + """Iterate over the string unicode characters. Returns: - An iterator of references to the string elements. + An iterator of references to the string unicode characters. """ return self.codepoint_slices() fn __reversed__(self) -> CodepointSliceIter[__origin_of(self), False]: - """Iterate backwards over the string, returning immutable references. + """Iterate backwards over the string unicode characters. Returns: - A reversed iterator of references to the string elements. + A reversed iterator of references to the string unicode characters. """ return CodepointSliceIter[__origin_of(self), forward=False](self) @@ -1305,10 +1317,8 @@ struct String( Notes: This does not include the trailing null terminator. """ - - # Does NOT include the NUL terminator. return Span[Byte, __origin_of(self)]( - ptr=self._buffer.unsafe_ptr(), length=self.byte_length() + ptr=self.unsafe_ptr(), length=self.byte_length() ) @always_inline @@ -1412,44 +1422,85 @@ struct String( """ return self.as_string_slice().isspace() - # TODO(MSTDL-590): String.split() should return `StringSlice`s. - fn split(self, sep: StringSlice, maxsplit: Int = -1) raises -> List[String]: + @always_inline + fn split( + self, sep: StringSlice, maxsplit: Int + ) -> List[StringSlice[__origin_of(self)]]: """Split the string by a separator. Args: sep: The string to split on. maxsplit: The maximum amount of items to split from String. - Defaults to unlimited. Returns: A List of Strings containing the input split by the separator. - Raises: - If the separator is empty. + Examples: + + ```mojo + # Splitting with maxsplit + _ = "1,2,3".split(",", maxsplit=1) # ['1', '2,3'] + # Splitting with starting or ending separators + _ = ",1,2,3,".split(",", maxsplit=1) # ['', '1,2,3,'] + _ = "123".split("", maxsplit=1) # ['', '123'] + ``` + . + """ + return self.as_string_slice().split(sep, maxsplit) + + @always_inline + fn split(self, sep: StringSlice) -> List[StringSlice[__origin_of(self)]]: + """Split the string by a separator. + + Args: + sep: The string to split on. + + Returns: + A List of Strings containing the input split by the separator. Examples: ```mojo # Splitting a space - _ = String("hello world").split(" ") # ["hello", "world"] + _ = "hello world".split(" ") # ["hello", "world"] # Splitting adjacent separators - _ = String("hello,,world").split(",") # ["hello", "", "world"] + _ = "hello,,world".split(",") # ["hello", "", "world"] + # Splitting with starting or ending separators + _ = ",1,2,3,".split(",") # ['', '1', '2', '3', ''] + _ = "123".split("") # ['', '1', '2', '3', ''] + ``` + . + """ + return self.as_string_slice().split(sep) + + @always_inline + fn split(self, *, maxsplit: Int) -> List[StringSlice[__origin_of(self)]]: + """Split the string by every Whitespace separator. + + Args: + maxsplit: The maximum amount of items to split from String. + + Returns: + A List of Strings containing the input split by the separator. + + Examples: + + ```mojo # Splitting with maxsplit - _ = String("1,2,3").split(",", 1) # ['1', '2,3'] + _ = "1 2 3".split(maxsplit=1) # ['1', '2 3'] ``` . """ - return self.as_string_slice().split[sep.mut, sep.origin]( - sep, maxsplit=maxsplit - ) + return self.as_string_slice().split(maxsplit=maxsplit) - fn split(self, sep: NoneType = None, maxsplit: Int = -1) -> List[String]: + @always_inline + fn split( + self, sep: NoneType = None + ) -> List[StringSlice[__origin_of(self)]]: """Split the string by every Whitespace separator. Args: sep: None. - maxsplit: The maximum amount of items to split from String. Defaults - to unlimited. Returns: A List of Strings containing the input split by the separator. @@ -1458,31 +1509,18 @@ struct String( ```mojo # Splitting an empty string or filled with whitespaces - _ = String(" ").split() # [] - _ = String("").split() # [] - + _ = " ".split() # [] + _ = "".split() # [] # Splitting a string with leading, trailing, and middle whitespaces - _ = String(" hello world ").split() # ["hello", "world"] + _ = " hello world ".split() # ["hello", "world"] # Splitting adjacent universal newlines: - _ = String( + _ = ( "hello \\t\\n\\v\\f\\r\\x1c\\x1d\\x1e\\x85\\u2028\\u2029world" ).split() # ["hello", "world"] ``` . """ - - # TODO(MSTDL-590): Avoid the need to loop to convert `StringSlice` to - # `String` by making `String.split()` return `StringSlice`s. - var str_slices = self.as_string_slice()._split_whitespace( - maxsplit=maxsplit - ) - - var output = List[String](capacity=len(str_slices)) - - for str_slice in str_slices: - output.append(String(str_slice[])) - - return output^ + return self.as_string_slice().split() fn splitlines(self, keepends: Bool = False) -> List[String]: """Split the string at line boundaries. This corresponds to Python's @@ -1496,7 +1534,7 @@ struct String( Returns: A List of Strings containing the input split by line boundaries. """ - return _to_string_list(self.as_string_slice().splitlines(keepends)) + return to_string_list(self.as_string_slice().splitlines(keepends)) fn replace(self, old: StringSlice, new: StringSlice) -> String: """Return a copy of the string with all occurrences of substring `old` diff --git a/mojo/stdlib/src/collections/string/string_slice.mojo b/mojo/stdlib/src/collections/string/string_slice.mojo index 7ec78dc993..593c6145cd 100644 --- a/mojo/stdlib/src/collections/string/string_slice.mojo +++ b/mojo/stdlib/src/collections/string/string_slice.mojo @@ -548,7 +548,7 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]]( @implicit fn __init__[ O: ImmutableOrigin, // - ](mut self: StringSlice[O], ref [O]value: String): + ](out self: StringSlice[O], ref [O]value: String): """Construct an immutable StringSlice. Parameters: @@ -863,20 +863,19 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]]( self.unsafe_ptr(), rhs.unsafe_ptr(), min(len1, len2) ) - @deprecated("Use `str.codepoints()` or `str.codepoint_slices()` instead.") fn __iter__(self) -> CodepointSliceIter[origin]: - """Iterate over the string, returning immutable references. + """Iterate over the string, returning unicode characters. Returns: - An iterator of references to the string elements. + An iterator of references to the string unicode characters. """ return self.codepoint_slices() fn __reversed__(self) -> CodepointSliceIter[origin, False]: - """Iterate backwards over the string, returning immutable references. + """Iterate backwards over the string, returning unicode characters. Returns: - A reversed iterator of references to the string elements. + A reversed iterator of references to the string unicode characters. """ return CodepointSliceIter[origin, forward=False](self) @@ -949,159 +948,6 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]]( # Methods # ===------------------------------------------------------------------===# - fn split[ - sep_mut: Bool, - sep_origin: Origin[sep_mut], //, - ]( - self, - sep: StringSlice[sep_origin], - maxsplit: Int = -1, - ) raises -> List[ - String - ]: - """Split the string by a separator. - - Parameters: - sep_mut: Mutability of the `sep` string slice. - sep_origin: Origin of the `sep` string slice. - - Args: - sep: The string to split on. - maxsplit: The maximum amount of items to split from String. - Defaults to unlimited. - - Returns: - A List of Strings containing the input split by the separator. - - Raises: - If the separator is empty. - - Examples: - - ```mojo - # Splitting a space - _ = StringSlice("hello world").split(" ") # ["hello", "world"] - # Splitting adjacent separators - _ = StringSlice("hello,,world").split(",") # ["hello", "", "world"] - # Splitting with maxsplit - _ = StringSlice("1,2,3").split(",", 1) # ['1', '2,3'] - ``` - . - """ - var output = List[String]() - - var str_byte_len = self.byte_length() - 1 - var lhs = 0 - var rhs = 0 - var items = 0 - var sep_len = sep.byte_length() - if sep_len == 0: - raise Error("Separator cannot be empty.") - if str_byte_len < 0: - output.append(String("")) - - while lhs <= str_byte_len: - rhs = self.find(sep, lhs) - if rhs == -1: - output.append(String(self[lhs:])) - break - - if maxsplit > -1: - if items == maxsplit: - output.append(String(self[lhs:])) - break - items += 1 - - output.append(String(self[lhs:rhs])) - lhs = rhs + sep_len - - if self.endswith(sep) and (len(output) <= maxsplit or maxsplit == -1): - output.append(String("")) - - return output^ - - fn split( - self, sep: NoneType = None, maxsplit: Int = -1 - ) -> List[StringSlice[origin]]: - """Split the string by every Whitespace separator. - - Args: - sep: None. - maxsplit: The maximum amount of items to split from String. Defaults - to unlimited. - - Returns: - A List of Strings containing the input split by the separator. - - Examples: - - ```mojo - # Splitting an empty string or filled with whitespaces - _ = StringSlice(" ").split() # [] - _ = StringSlice("").split() # [] - - # Splitting a string with leading, trailing, and middle whitespaces - _ = StringSlice(" hello world ").split() # ["hello", "world"] - # Splitting adjacent universal newlines: - _ = StringSlice( - "hello \\t\\n\\v\\f\\r\\x1c\\x1d\\x1e\\x85\\u2028\\u2029world" - ).split() # ["hello", "world"] - ``` - . - """ - - return self._split_whitespace() - - fn _split_whitespace(self, maxsplit: Int = -1) -> List[StringSlice[origin]]: - fn num_bytes(b: UInt8) -> Int: - var flipped = ~b - return Int(count_leading_zeros(flipped) + (flipped >> 7)) - - var output = List[StringSlice[origin]]() - var str_byte_len = self.byte_length() - 1 - var lhs = 0 - var rhs = 0 - var items = 0 - while lhs <= str_byte_len: - try: - # Python adds all "whitespace chars" as one separator - # if no separator was specified - for s in self[lhs:].codepoint_slices(): - if not s.isspace(): - break - lhs += s.byte_length() - # if it went until the end of the String, then - # it should be sliced up until the original - # start of the whitespace which was already appended - if lhs - 1 == str_byte_len: - break - elif lhs == str_byte_len: - # if the last char is not whitespace - output.append(self[str_byte_len:]) - break - rhs = lhs + num_bytes(self.unsafe_ptr()[lhs]) - for s in self[ - lhs + num_bytes(self.unsafe_ptr()[lhs]) : - ].codepoint_slices(): - if s.isspace(): - break - rhs += s.byte_length() - - if maxsplit > -1: - if items == maxsplit: - output.append(self[lhs:]) - break - items += 1 - - output.append(self[lhs:rhs]) - lhs = rhs - except e: - return abort[List[StringSlice[origin]]]( - "unexpected exception during split()" - ) - - return output - @always_inline fn strip(self, chars: StringSlice) -> Self: """Return a copy of the string with leading and trailing characters @@ -1305,6 +1151,9 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]]( Returns: A slice containing the underlying sequence of encoded bytes. + + Notes: + This does not include the trailing null terminator. """ return self._slice @@ -1692,13 +1541,17 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]]( return Int(loc) - Int(self.unsafe_ptr()) - fn isspace(self) -> Bool: + fn isspace[single_character: Bool = False](self) -> Bool: """Determines whether every character in the given StringSlice is a python whitespace String. This corresponds to Python's [universal separators]( https://docs.python.org/3/library/stdtypes.html#str.splitlines): `" \\t\\n\\v\\f\\r\\x1c\\x1d\\x1e\\x85\\u2028\\u2029"`. + Parameters: + single_character: Whether to evaluate the stringslice as a single + unicode character (avoids overhead when already iterating). + Returns: True if the whole StringSlice is made up of whitespace characters listed above, otherwise False. @@ -1724,14 +1577,142 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]]( . """ - if self.byte_length() == 0: + fn _is_space_char(s: StringSlice) -> Bool: + # sorry for readability, but this has less overhead than memcmp + # highly performance sensitive code, benchmark before touching + alias ` ` = UInt8(ord(" ")) + alias `\t` = UInt8(ord("\t")) + alias `\n` = UInt8(ord("\n")) + alias `\r` = UInt8(ord("\r")) + alias `\f` = UInt8(ord("\f")) + alias `\v` = UInt8(ord("\v")) + alias `\x1c` = UInt8(ord("\x1c")) + alias `\x1d` = UInt8(ord("\x1d")) + alias `\x1e` = UInt8(ord("\x1e")) + + var no_null_len = s.byte_length() + var ptr = s.unsafe_ptr() + if likely(no_null_len == 1): + var c = ptr[0] + return ( + c == ` ` + or c == `\t` + or c == `\n` + or c == `\r` + or c == `\f` + or c == `\v` + or c == `\x1c` + or c == `\x1d` + or c == `\x1e` + ) + elif no_null_len == 2: + return ptr[0] == 0xC2 and ptr[1] == 0x85 # next_line: \x85 + elif no_null_len == 3: + # unicode line sep or paragraph sep: \u2028 , \u2029 + lastbyte = ptr[2] == 0xA8 or ptr[2] == 0xA9 + return ptr[0] == 0xE2 and ptr[1] == 0x80 and lastbyte return False - for s in self.codepoints(): - if not s.is_python_space(): - return False + @parameter + if single_character: + return _is_space_char(self) + else: + for s in self: + if not _is_space_char(s): + return False + return self.byte_length() != 0 - return True + @always_inline + fn split(self, sep: StringSlice, maxsplit: Int) -> List[Self]: + """Split the string by a separator. + + Args: + sep: The string to split on. + maxsplit: The maximum amount of items to split from String. + + Returns: + A List of Strings containing the input split by the separator. + + Examples: + ```mojo + # Splitting with maxsplit + _ = "1,2,3".split(",", maxsplit=1) # ['1', '2,3'] + # Splitting with starting or ending separators + _ = ",1,2,3,".split(",", maxsplit=1) # ['', '1,2,3,'] + _ = "123".split("", maxsplit=1) # ['', '123'] + ``` + . + """ + return _split[has_maxsplit=True](self, sep, maxsplit) + + @always_inline + fn split(self, sep: StringSlice) -> List[Self]: + """Split the string by a separator. + + Args: + sep: The string to split on. + + Returns: + A List of Strings containing the input split by the separator. + + Examples: + ```mojo + # Splitting a space + _ = "hello world".split(" ") # ["hello", "world"] + # Splitting adjacent separators + _ = "hello,,world".split(",") # ["hello", "", "world"] + # Splitting with starting or ending separators + _ = ",1,2,3,".split(",") # ['', '1', '2', '3', ''] + _ = "123".split("") # ['', '1', '2', '3', ''] + ``` + . + """ + return _split[has_maxsplit=False](self, sep, -1) + + @always_inline + fn split(self, *, maxsplit: Int) -> List[Self]: + """Split the string by every Whitespace separator. + + Args: + maxsplit: The maximum amount of items to split from String. + + Returns: + A List of Strings containing the input split by the separator. + + Examples: + ```mojo + # Splitting with maxsplit + _ = "1 2 3".split(maxsplit=1) # ['1', '2 3'] + ``` + . + """ + return _split[has_maxsplit=True](self, None, maxsplit) + + @always_inline + fn split(self, sep: NoneType = None) -> List[Self]: + """Split the string by every Whitespace separator. + + Args: + sep: None. + + Returns: + A List of Strings containing the input split by the separator. + + Examples: + ```mojo + # Splitting an empty string or filled with whitespaces + _ = " ".split() # [] + _ = "".split() # [] + # Splitting a string with leading, trailing, and middle whitespaces + _ = " hello world ".split() # ["hello", "world"] + # Splitting adjacent universal newlines: + _ = ( + "hello \\t\\n\\r\\f\\v\\x1c\\x1d\\x1e\\x85\\u2028\\u2029world" + ).split() # ["hello", "world"] + ``` + . + """ + return _split[has_maxsplit=False](self, sep, -1) fn isnewline[single_character: Bool = False](self) -> Bool: """Determines whether every character in the given StringSlice is a @@ -1746,7 +1727,7 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]]( Returns: True if the whole StringSlice is made up of whitespace characters - listed above, otherwise False. + listed above, otherwise False. """ var ptr = self.unsafe_ptr() @@ -2004,12 +1985,13 @@ fn _to_string_list[ @always_inline -fn _to_string_list[ - O: ImmutableOrigin, // +fn to_string_list[ + mut: Bool, O: Origin[mut], // ](items: List[StringSlice[O]]) -> List[String]: """Create a list of Strings **copying** the existing data. Parameters: + mut: The mutability of the data. O: The origin of the data. Args: @@ -2029,12 +2011,13 @@ fn _to_string_list[ @always_inline -fn _to_string_list[ - O: ImmutableOrigin, // +fn to_string_list[ + mut: Bool, O: Origin[mut], // ](items: List[Span[Byte, O]]) -> List[String]: """Create a list of Strings **copying** the existing data. Parameters: + mut: The mutability of the data. O: The origin of the data. Args: @@ -2289,3 +2272,110 @@ fn _memrmem[ if memcmp(haystack + i + 1, needle + 1, needle_len - 1) == 0: return haystack + i return UnsafePointer[Scalar[type]]() + + +fn _split[ + has_maxsplit: Bool +]( + src_str: StringSlice, + sep: StringSlice, + maxsplit: Int, + out output: List[__type_of(src_str)], +): + alias S = __type_of(src_str) + alias O = __type_of(src_str).origin + var ptr = src_str.unsafe_ptr().origin_cast[origin=MutableAnyOrigin]() + var sep_len = sep.byte_length() + if sep_len == 0: + var iterator = src_str.__iter__() + var i_len = len(iterator) + 2 + var out_ptr = UnsafePointer[S].alloc(i_len) + out_ptr[0] = S(ptr=ptr, length=0) + var i = 1 + for s in iterator: + out_ptr[i] = s + i += 1 + out_ptr[i] = S(ptr=ptr + i, length=0) + output = __type_of(output)(ptr=out_ptr, length=i_len, capacity=i_len) + return + + alias prealloc = 32 # guessing, Python's implementation uses 12 + var amnt = prealloc + + @parameter + if has_maxsplit: + amnt = maxsplit + 1 if maxsplit < prealloc else prealloc + output = __type_of(output)(capacity=amnt) + var str_byte_len = src_str.byte_length() + var lhs = 0 + var rhs = 0 + var items = 0 + # var str_span = src_str.as_bytes() # FIXME: solve #3526 with #3548 + # var sep_span = sep.as_bytes() # FIXME: solve #3526 with #3548 + + while lhs <= str_byte_len: + # FIXME(#3526): use str_span and sep_span + rhs = src_str.find(sep, lhs) + # if not found go to the end + rhs += -Int(rhs == -1) & (str_byte_len + 1) + + @parameter + if has_maxsplit: + rhs += -Int(items == maxsplit) & (str_byte_len - rhs) + items += 1 + + output.append(S(ptr=ptr + lhs, length=rhs - lhs)) + lhs = rhs + sep_len + + +fn _split[ + has_maxsplit: Bool +]( + src_str: StringSlice, + sep: NoneType, + maxsplit: Int, + out output: List[__type_of(src_str)], +): + alias S = __type_of(src_str) + alias O = __type_of(src_str).origin + alias prealloc = 32 # guessing, Python's implementation uses 12 + var amnt = prealloc + + @parameter + if has_maxsplit: + amnt = maxsplit + 1 if maxsplit < prealloc else prealloc + output = __type_of(output)(capacity=amnt) + var str_byte_len = src_str.byte_length() + var lhs = 0 + var rhs = 0 + var items = 0 + var ptr = src_str.unsafe_ptr().origin_cast[origin=MutableAnyOrigin]() + + @always_inline("nodebug") + fn _build_slice(p: UnsafePointer[Byte], start: Int, end: Int) -> S: + return S(ptr=p + start, length=end - start) + + while lhs <= str_byte_len: + # Python adds all "whitespace chars" as one separator + # if no separator was specified + for s in _build_slice(ptr, lhs, str_byte_len): + if not s.isspace[single_character=True](): + break + lhs += s.byte_length() + # if it went until the end of the String, then it should be sliced + # until the start of the whitespace which was already appended + if lhs == str_byte_len: + break + rhs = lhs + _utf8_first_byte_sequence_length(ptr[lhs]) + for s in _build_slice(ptr, rhs, str_byte_len): + if s.isspace[single_character=True](): + break + rhs += s.byte_length() + + @parameter + if has_maxsplit: + rhs += -Int(items == maxsplit) & (str_byte_len - rhs) + items += 1 + + output.append(S(ptr=ptr + lhs, length=rhs - lhs)) + lhs = rhs diff --git a/mojo/stdlib/src/os/path/path.mojo b/mojo/stdlib/src/os/path/path.mojo index bb9a14a9ca..3959ec03cd 100644 --- a/mojo/stdlib/src/os/path/path.mojo +++ b/mojo/stdlib/src/os/path/path.mojo @@ -26,7 +26,7 @@ from sys import has_neon, os_is_linux, os_is_macos, os_is_windows from memory import Span -from utils import StringSlice +from collections.string import StringSlice from .. import PathLike from .._linux_aarch64 import _lstat as _lstat_linux_arm @@ -129,7 +129,7 @@ fn expanduser[PathLike: os.PathLike, //](path: PathLike) raises -> String: var path_split = fspath.split(os.sep, 1) # If there is a properly formatted separator, return expanded fspath. if len(path_split) == 2: - return os.path.join(userhome, path_split[1]) + return os.path.join(userhome, String(path_split[1])) # Path was a single `~` character, return home path return userhome diff --git a/mojo/stdlib/src/testing/testing.mojo b/mojo/stdlib/src/testing/testing.mojo index 355d54653b..efe3a97328 100644 --- a/mojo/stdlib/src/testing/testing.mojo +++ b/mojo/stdlib/src/testing/testing.mojo @@ -36,7 +36,7 @@ from math import isclose from builtin._location import __call_location, _SourceLocation from memory import memcmp -from utils import StringSlice +from collections.string import StringSlice # ===----------------------------------------------------------------------=== # # Assertions diff --git a/mojo/stdlib/test/builtin/test_sort.mojo b/mojo/stdlib/test/builtin/test_sort.mojo index 606873454c..96dfb485c7 100644 --- a/mojo/stdlib/test/builtin/test_sort.mojo +++ b/mojo/stdlib/test/builtin/test_sort.mojo @@ -18,6 +18,7 @@ from sys import env_get_string, os_is_windows from builtin.sort import _quicksort, _small_sort, _SortWrapper from testing import assert_equal, assert_false, assert_true +from collections.string.string_slice import to_string_list fn random_numbers[ @@ -533,7 +534,7 @@ def test_sort_strings(): var text = ( _dir_of_current_file() / "test_file_dummy_input.txt" ).read_text() - var strings = text.split(" ") + var strings = to_string_list(text.split(" ")) sort(strings) assert_sorted_string(strings) diff --git a/mojo/stdlib/test/builtin/test_string_literal.mojo b/mojo/stdlib/test/builtin/test_string_literal.mojo index a8da563eb3..bee3b115ed 100644 --- a/mojo/stdlib/test/builtin/test_string_literal.mojo +++ b/mojo/stdlib/test/builtin/test_string_literal.mojo @@ -22,6 +22,7 @@ from testing import ( assert_raises, assert_true, ) +from collections.string import StringSlice from builtin.string_literal import ( _base64_encode, _base64_decode, @@ -417,64 +418,85 @@ def test_center(): def test_split(): - var d = "hello world".split() - assert_true(len(d) == 2) - assert_true(d[0] == "hello") - assert_true(d[1] == "world") - d = "hello \t\n\n\v\fworld".split("\n") - assert_true(len(d) == 3) - assert_true(d[0] == "hello \t" and d[1] == "" and d[2] == "\v\fworld") + alias L = List[StringSlice[StaticConstantOrigin]] + # Should add all whitespace-like chars as one + # test all unicode separators + # 0 is to build a String with null terminator + alias next_line = List[UInt8](0xC2, 0x85, 0) + """TODO: \\x85""" + alias unicode_line_sep = List[UInt8](0xE2, 0x80, 0xA8, 0) + """TODO: \\u2028""" + alias unicode_paragraph_sep = List[UInt8](0xE2, 0x80, 0xA9, 0) + """TODO: \\u2029""" + # TODO add line and paragraph separator as StringLiteral once unicode + # escape secuences are accepted + univ_sep_var = ( + " " + + "\t" + + "\n" + + "\r" + + "\v" + + "\f" + + "\x1c" + + "\x1d" + + "\x1e" + + String(buffer=next_line) + + String(buffer=unicode_line_sep) + + String(buffer=unicode_paragraph_sep) + ) + s = univ_sep_var + "hello" + univ_sep_var + "world" + univ_sep_var + assert_equal(s.split(), L("hello", "world")) # should split into empty strings between separators - d = "1,,,3".split(",") - assert_true(len(d) == 4) - assert_true(d[0] == "1" and d[1] == "" and d[2] == "" and d[3] == "3") - d = "abababaaba".split("aba") - assert_true(len(d) == 4) - assert_true(d[0] == "" and d[1] == "b" and d[2] == "" and d[3] == "") - - # should split into maxsplit + 1 items - d = "1,2,3".split(",", 0) - assert_true(len(d) == 1) - assert_true(d[0] == "1,2,3") - d = "1,2,3".split(",", 1) - assert_true(len(d) == 2) - assert_true(d[0] == "1" and d[1] == "2,3") - + assert_equal("1,,,3".split(","), L("1", "", "", "3")) + assert_equal(",,,".split(","), L("", "", "", "")) + assert_equal(" a b ".split(" "), L("", "a", "b", "")) + assert_equal("abababaaba".split("aba"), L("", "b", "", "")) assert_true(len("".split()) == 0) assert_true(len(" ".split()) == 0) assert_true(len("".split(" ")) == 1) + assert_true(len(",".split(",")) == 2) assert_true(len(" ".split(" ")) == 2) + assert_true(len("".split("")) == 2) assert_true(len(" ".split(" ")) == 3) assert_true(len(" ".split(" ")) == 4) - with assert_raises(): - _ = "".split("") - - # Matches should be properly split in multiple case - var d2 = " " - var in2 = "modcon is coming soon" - var res2 = in2.split(d2) - assert_equal(len(res2), 4) - assert_equal(res2[0], "modcon") - assert_equal(res2[1], "is") - assert_equal(res2[2], "coming") - assert_equal(res2[3], "soon") + # should split into maxsplit + 1 items + assert_equal("1,2,3".split(",", 0), L("1,2,3")) + assert_equal("1,2,3".split(",", 1), L("1", "2,3")) + + # Split in middle + assert_equal("faang".split("n"), L("faa", "g")) # No match from the delimiter - var d3 = "x" - var in3 = "hello world" - var res3 = in3.split(d3) - assert_equal(len(res3), 1) - assert_equal(res3[0], "hello world") + assert_equal("hello world".split("x"), L("hello world")) # Multiple character delimiter - var d4 = "ll" - var in4 = "hello" - var res4 = in4.split(d4) - assert_equal(len(res4), 2) - assert_equal(res4[0], "he") - assert_equal(res4[1], "o") + assert_equal("hello".split("ll"), L("he", "o")) + + res = L("", "bb", "", "", "", "bbb", "") + assert_equal("abbaaaabbba".split("a"), res) + assert_equal("abbaaaabbba".split("a", 8), res) + s1 = "abbaaaabbba".split("a", 5) + assert_equal(s1, L("", "bb", "", "", "", "bbba")) + assert_equal("aaa".split("a", 0), L("aaa")) + assert_equal("a".split("a"), L("", "")) + assert_equal("1,2,3".split("3", 0), L("1,2,3")) + assert_equal("1,2,3".split("3", 1), L("1,2,", "")) + assert_equal("1,2,3,3".split("3", 2), L("1,2,", ",", "")) + assert_equal("1,2,3,3,3".split("3", 2), L("1,2,", ",", ",3")) + + assert_equal("Hello πŸ”₯!".split(), L("Hello", "πŸ”₯!")) + + s2 = "Π›ΠΎΡ€Π΅ΠΌ ипсум Π΄ΠΎΠ»ΠΎΡ€ сит Π°ΠΌΠ΅Ρ‚".split(" ") + assert_equal(s2, L("Π›ΠΎΡ€Π΅ΠΌ", "ипсум", "Π΄ΠΎΠ»ΠΎΡ€", "сит", "Π°ΠΌΠ΅Ρ‚")) + s3 = "Π›ΠΎΡ€Π΅ΠΌ ипсум Π΄ΠΎΠ»ΠΎΡ€ сит Π°ΠΌΠ΅Ρ‚".split("ΠΌ") + assert_equal(s3, L("Π›ΠΎΡ€Π΅", " ипсу", " Π΄ΠΎΠ»ΠΎΡ€ сит Π°", "Π΅Ρ‚")) + + assert_equal("123".split(""), L("", "1", "2", "3", "")) + assert_equal("".join("123".split("")), "123") + assert_equal(",1,2,3,".split(","), "123".split("")) + assert_equal(",".join("123".split("")), ",1,2,3,") def test_splitlines(): diff --git a/mojo/stdlib/test/collections/string/test_string.mojo b/mojo/stdlib/test/collections/string/test_string.mojo index 42e6adf1cc..e24129dd55 100644 --- a/mojo/stdlib/test/collections/string/test_string.mojo +++ b/mojo/stdlib/test/collections/string/test_string.mojo @@ -626,15 +626,7 @@ def test_rfind(): def test_split(): - # empty separators default to whitespace - var d = String("hello world").split() - assert_true(len(d) == 2) - assert_true(d[0] == "hello") - assert_true(d[1] == "world") - d = String("hello \t\n\n\v\fworld").split("\n") - assert_true(len(d) == 3) - assert_true(d[0] == "hello \t" and d[1] == "" and d[2] == "\v\fworld") - + alias L = List[StringSlice[StaticConstantOrigin]] # Should add all whitespace-like chars as one # test all unicode separators # 0 is to build a String with null terminator @@ -662,116 +654,59 @@ def test_split(): String(buffer=unicode_paragraph_sep), ) ) - var s = univ_sep_var + "hello" + univ_sep_var + "world" + univ_sep_var - d = s.split() - assert_true(len(d) == 2) - assert_true(d[0] == "hello" and d[1] == "world") + s = univ_sep_var + "hello" + univ_sep_var + "world" + univ_sep_var + assert_equal(s.split(), L("hello", "world")) # should split into empty strings between separators - d = String("1,,,3").split(",") - assert_true(len(d) == 4) - assert_true(d[0] == "1" and d[1] == "" and d[2] == "" and d[3] == "3") - d = String(",,,").split(",") - assert_true(len(d) == 4) - assert_true(d[0] == "" and d[1] == "" and d[2] == "" and d[3] == "") - d = String(" a b ").split(" ") - assert_true(len(d) == 4) - assert_true(d[0] == "" and d[1] == "a" and d[2] == "b" and d[3] == "") - d = String("abababaaba").split("aba") - assert_true(len(d) == 4) - assert_true(d[0] == "" and d[1] == "b" and d[2] == "" and d[3] == "") + assert_equal("1,,,3".split(","), L("1", "", "", "3")) + assert_equal(",,,".split(","), L("", "", "", "")) + assert_equal(" a b ".split(" "), L("", "a", "b", "")) + assert_equal("abababaaba".split("aba"), L("", "b", "", "")) + assert_true(len("".split()) == 0) + assert_true(len(" ".split()) == 0) + assert_true(len("".split(" ")) == 1) + assert_true(len(",".split(",")) == 2) + assert_true(len(" ".split(" ")) == 2) + assert_true(len("".split("")) == 2) + assert_true(len(" ".split(" ")) == 3) + assert_true(len(" ".split(" ")) == 4) # should split into maxsplit + 1 items - d = String("1,2,3").split(",", 0) - assert_true(len(d) == 1) - assert_true(d[0] == "1,2,3") - d = String("1,2,3").split(",", 1) - assert_true(len(d) == 2) - assert_true(d[0] == "1" and d[1] == "2,3") - - assert_true(len(String("").split()) == 0) - assert_true(len(String(" ").split()) == 0) - assert_true(len(String("").split(" ")) == 1) - assert_true(len(String(" ").split(" ")) == 2) - assert_true(len(String(" ").split(" ")) == 3) - assert_true(len(String(" ").split(" ")) == 4) - - with assert_raises(): - _ = String("").split("") + assert_equal("1,2,3".split(",", 0), L("1,2,3")) + assert_equal("1,2,3".split(",", 1), L("1", "2,3")) # Split in middle - var d1 = String("n") - var in1 = String("faang") - var res1 = in1.split(d1) - assert_equal(len(res1), 2) - assert_equal(res1[0], "faa") - assert_equal(res1[1], "g") - - # Matches should be properly split in multiple case - var d2 = String(" ") - var in2 = String("modcon is coming soon") - var res2 = in2.split(d2) - assert_equal(len(res2), 4) - assert_equal(res2[0], "modcon") - assert_equal(res2[1], "is") - assert_equal(res2[2], "coming") - assert_equal(res2[3], "soon") + assert_equal("faang".split("n"), L("faa", "g")) # No match from the delimiter - var d3 = String("x") - var in3 = String("hello world") - var res3 = in3.split(d3) - assert_equal(len(res3), 1) - assert_equal(res3[0], "hello world") + assert_equal("hello world".split("x"), L("hello world")) # Multiple character delimiter - var d4 = String("ll") - var in4 = String("hello") - var res4 = in4.split(d4) - assert_equal(len(res4), 2) - assert_equal(res4[0], "he") - assert_equal(res4[1], "o") - - # related to #2879 - # TODO: replace string comparison when __eq__ is implemented for List - assert_equal( - String("abbaaaabbba").split("a").__str__(), - "['', 'bb', '', '', '', 'bbb', '']", - ) - assert_equal( - String("abbaaaabbba").split("a", 8).__str__(), - "['', 'bb', '', '', '', 'bbb', '']", - ) - assert_equal( - String("abbaaaabbba").split("a", 5).__str__(), - "['', 'bb', '', '', '', 'bbba']", - ) - assert_equal(String("aaa").split("a", 0).__str__(), "['aaa']") - assert_equal(String("a").split("a").__str__(), "['', '']") - assert_equal(String("1,2,3").split("3", 0).__str__(), "['1,2,3']") - assert_equal(String("1,2,3").split("3", 1).__str__(), "['1,2,', '']") - assert_equal(String("1,2,3,3").split("3", 2).__str__(), "['1,2,', ',', '']") - assert_equal( - String("1,2,3,3,3").split("3", 2).__str__(), "['1,2,', ',', ',3']" - ) - - var in5 = String("Hello πŸ”₯!") - var res5 = in5.split() - assert_equal(len(res5), 2) - assert_equal(res5[0], "Hello") - assert_equal(res5[1], "πŸ”₯!") - - var in6 = String("Π›ΠΎΡ€Π΅ΠΌ ипсум Π΄ΠΎΠ»ΠΎΡ€ сит Π°ΠΌΠ΅Ρ‚") - var res6 = in6.split(" ") - assert_equal(len(res6), 5) - assert_equal(res6[0], "Π›ΠΎΡ€Π΅ΠΌ") - assert_equal(res6[1], "ипсум") - assert_equal(res6[2], "Π΄ΠΎΠ»ΠΎΡ€") - assert_equal(res6[3], "сит") - assert_equal(res6[4], "Π°ΠΌΠ΅Ρ‚") - - with assert_raises(contains="Separator cannot be empty."): - _ = String("1, 2, 3").split("") + assert_equal("hello".split("ll"), L("he", "o")) + + res = L("", "bb", "", "", "", "bbb", "") + assert_equal("abbaaaabbba".split("a"), res) + assert_equal("abbaaaabbba".split("a", 8), res) + s1 = "abbaaaabbba".split("a", 5) + assert_equal(s1, L("", "bb", "", "", "", "bbba")) + assert_equal("aaa".split("a", 0), L("aaa")) + assert_equal("a".split("a"), L("", "")) + assert_equal("1,2,3".split("3", 0), L("1,2,3")) + assert_equal("1,2,3".split("3", 1), L("1,2,", "")) + assert_equal("1,2,3,3".split("3", 2), L("1,2,", ",", "")) + assert_equal("1,2,3,3,3".split("3", 2), L("1,2,", ",", ",3")) + + assert_equal("Hello πŸ”₯!".split(), L("Hello", "πŸ”₯!")) + + s2 = "Π›ΠΎΡ€Π΅ΠΌ ипсум Π΄ΠΎΠ»ΠΎΡ€ сит Π°ΠΌΠ΅Ρ‚".split(" ") + assert_equal(s2, L("Π›ΠΎΡ€Π΅ΠΌ", "ипсум", "Π΄ΠΎΠ»ΠΎΡ€", "сит", "Π°ΠΌΠ΅Ρ‚")) + s3 = "Π›ΠΎΡ€Π΅ΠΌ ипсум Π΄ΠΎΠ»ΠΎΡ€ сит Π°ΠΌΠ΅Ρ‚".split("ΠΌ") + assert_equal(s3, L("Π›ΠΎΡ€Π΅", " ипсу", " Π΄ΠΎΠ»ΠΎΡ€ сит Π°", "Π΅Ρ‚")) + + assert_equal("123".split(""), L("", "1", "2", "3", "")) + assert_equal("".join("123".split("")), "123") + assert_equal(",1,2,3,".split(","), "123".split("")) + assert_equal(",".join("123".split("")), ",1,2,3,") def test_splitlines(): diff --git a/mojo/stdlib/test/collections/string/test_string_slice.mojo b/mojo/stdlib/test/collections/string/test_string_slice.mojo index 97a4df03bd..4c6e8b2a0b 100644 --- a/mojo/stdlib/test/collections/string/test_string_slice.mojo +++ b/mojo/stdlib/test/collections/string/test_string_slice.mojo @@ -698,15 +698,7 @@ def test_count_utf8_continuation_bytes(): def test_split(): - # empty separators default to whitespace - var d0 = StringSlice("hello world").split() - assert_true(len(d0) == 2) - assert_true(d0[0] == "hello") - assert_true(d0[1] == "world") - var d = StringSlice("hello \t\n\n\v\fworld").split(sep="\n") - assert_true(len(d) == 3) - assert_true(d[0] == "hello \t" and d[1] == "" and d[2] == "\v\fworld") - + alias L = List[StringSlice[StaticConstantOrigin]] # Should add all whitespace-like chars as one # test all unicode separators # 0 is to build a String with null terminator @@ -718,134 +710,73 @@ def test_split(): """TODO: \\u2029""" # TODO add line and paragraph separator as StringLiteral once unicode # escape secuences are accepted - var univ_sep_var = ( - String( - " ", - "\t", - "\n", - "\r", - "\v", - "\f", - "\x1c", - "\x1d", - "\x1e", - String(buffer=next_line), - String(buffer=unicode_line_sep), - String(buffer=unicode_paragraph_sep), - ) + univ_sep_var = ( + " " + + "\t" + + "\n" + + "\r" + + "\v" + + "\f" + + "\x1c" + + "\x1d" + + "\x1e" + + String(buffer=next_line) + + String(buffer=unicode_line_sep) + + String(buffer=unicode_paragraph_sep) ) - var s = univ_sep_var + "hello" + univ_sep_var + "world" + univ_sep_var - var ds1 = StringSlice(s).split() - assert_true(len(ds1) == 2) - assert_true(ds1[0] == "hello" and ds1[1] == "world") + s = univ_sep_var + "hello" + univ_sep_var + "world" + univ_sep_var + assert_equal(s.split(), L("hello", "world")) # should split into empty strings between separators - d = StringSlice("1,,,3").split(",") - assert_true(len(d) == 4) - assert_true(d[0] == "1" and d[1] == "" and d[2] == "" and d[3] == "3") - d = StringSlice(",,,").split(",") - assert_true(len(d) == 4) - assert_true(d[0] == "" and d[1] == "" and d[2] == "" and d[3] == "") - d = StringSlice(" a b ").split(" ") - assert_true(len(d) == 4) - assert_true(d[0] == "" and d[1] == "a" and d[2] == "b" and d[3] == "") - d = StringSlice("abababaaba").split("aba") - assert_true(len(d) == 4) - assert_true(d[0] == "" and d[1] == "b" and d[2] == "" and d[3] == "") + assert_equal("1,,,3".split(","), L("1", "", "", "3")) + assert_equal(",,,".split(","), L("", "", "", "")) + assert_equal(" a b ".split(" "), L("", "a", "b", "")) + assert_equal("abababaaba".split("aba"), L("", "b", "", "")) + assert_true(len("".split()) == 0) + assert_true(len(" ".split()) == 0) + assert_true(len("".split(" ")) == 1) + assert_true(len(",".split(",")) == 2) + assert_true(len(" ".split(" ")) == 2) + assert_true(len("".split("")) == 2) + assert_true(len(" ".split(" ")) == 3) + assert_true(len(" ".split(" ")) == 4) # should split into maxsplit + 1 items - d = StringSlice("1,2,3").split(",", 0) - assert_true(len(d) == 1) - assert_true(d[0] == "1,2,3") - d = StringSlice("1,2,3").split(",", 1) - assert_true(len(d) == 2) - assert_true(d[0] == "1" and d[1] == "2,3") - - assert_true(len(StringSlice("").split()) == 0) - assert_true(len(StringSlice(" ").split()) == 0) - assert_true(len(StringSlice("").split(" ")) == 1) - assert_true(len(StringSlice(" ").split(" ")) == 2) - assert_true(len(StringSlice(" ").split(" ")) == 3) - assert_true(len(StringSlice(" ").split(" ")) == 4) - - with assert_raises(): - _ = StringSlice("").split("") + assert_equal("1,2,3".split(",", 0), L("1,2,3")) + assert_equal("1,2,3".split(",", 1), L("1", "2,3")) # Split in middle - var d1 = StringSlice("n") - var in1 = StringSlice("faang") - var res1 = in1.split(d1) - assert_equal(len(res1), 2) - assert_equal(res1[0], "faa") - assert_equal(res1[1], "g") - - # Matches should be properly split in multiple case - var d2 = StringSlice(" ") - var in2 = StringSlice("modcon is coming soon") - var res2 = in2.split(d2) - assert_equal(len(res2), 4) - assert_equal(res2[0], "modcon") - assert_equal(res2[1], "is") - assert_equal(res2[2], "coming") - assert_equal(res2[3], "soon") + assert_equal("faang".split("n"), L("faa", "g")) # No match from the delimiter - var d3 = StringSlice("x") - var in3 = StringSlice("hello world") - var res3 = in3.split(d3) - assert_equal(len(res3), 1) - assert_equal(res3[0], "hello world") + assert_equal("hello world".split("x"), L("hello world")) # Multiple character delimiter - var d4 = StringSlice("ll") - var in4 = StringSlice("hello") - var res4 = in4.split(d4) - assert_equal(len(res4), 2) - assert_equal(res4[0], "he") - assert_equal(res4[1], "o") - - # related to #2879 - # TODO: replace string comparison when __eq__ is implemented for List - assert_equal( - StringSlice("abbaaaabbba").split("a").__str__(), - "['', 'bb', '', '', '', 'bbb', '']", - ) - assert_equal( - StringSlice("abbaaaabbba").split("a", 8).__str__(), - "['', 'bb', '', '', '', 'bbb', '']", - ) - assert_equal( - StringSlice("abbaaaabbba").split("a", 5).__str__(), - "['', 'bb', '', '', '', 'bbba']", - ) - assert_equal(StringSlice("aaa").split("a", 0).__str__(), "['aaa']") - assert_equal(StringSlice("a").split("a").__str__(), "['', '']") - assert_equal(StringSlice("1,2,3").split("3", 0).__str__(), "['1,2,3']") - assert_equal(StringSlice("1,2,3").split("3", 1).__str__(), "['1,2,', '']") - assert_equal( - StringSlice("1,2,3,3").split("3", 2).__str__(), "['1,2,', ',', '']" - ) - assert_equal( - StringSlice("1,2,3,3,3").split("3", 2).__str__(), "['1,2,', ',', ',3']" - ) + assert_equal("hello".split("ll"), L("he", "o")) - var in5 = StringSlice("Hello πŸ”₯!") - var res5 = in5.split() - assert_equal(len(res5), 2) - assert_equal(res5[0], "Hello") - assert_equal(res5[1], "πŸ”₯!") + res = L("", "bb", "", "", "", "bbb", "") + assert_equal("abbaaaabbba".split("a"), res) + assert_equal("abbaaaabbba".split("a", 8), res) + s1 = "abbaaaabbba".split("a", 5) + assert_equal(s1, L("", "bb", "", "", "", "bbba")) + assert_equal("aaa".split("a", 0), L("aaa")) + assert_equal("a".split("a"), L("", "")) + assert_equal("1,2,3".split("3", 0), L("1,2,3")) + assert_equal("1,2,3".split("3", 1), L("1,2,", "")) + assert_equal("1,2,3,3".split("3", 2), L("1,2,", ",", "")) + assert_equal("1,2,3,3,3".split("3", 2), L("1,2,", ",", ",3")) - var in6 = StringSlice("Π›ΠΎΡ€Π΅ΠΌ ипсум Π΄ΠΎΠ»ΠΎΡ€ сит Π°ΠΌΠ΅Ρ‚") - var res6 = in6.split(" ") - assert_equal(len(res6), 5) - assert_equal(res6[0], "Π›ΠΎΡ€Π΅ΠΌ") - assert_equal(res6[1], "ипсум") - assert_equal(res6[2], "Π΄ΠΎΠ»ΠΎΡ€") - assert_equal(res6[3], "сит") - assert_equal(res6[4], "Π°ΠΌΠ΅Ρ‚") + assert_equal("Hello πŸ”₯!".split(), L("Hello", "πŸ”₯!")) - with assert_raises(contains="Separator cannot be empty."): - _ = StringSlice("1, 2, 3").split("") + s2 = "Π›ΠΎΡ€Π΅ΠΌ ипсум Π΄ΠΎΠ»ΠΎΡ€ сит Π°ΠΌΠ΅Ρ‚".split(" ") + assert_equal(s2, L("Π›ΠΎΡ€Π΅ΠΌ", "ипсум", "Π΄ΠΎΠ»ΠΎΡ€", "сит", "Π°ΠΌΠ΅Ρ‚")) + s3 = "Π›ΠΎΡ€Π΅ΠΌ ипсум Π΄ΠΎΠ»ΠΎΡ€ сит Π°ΠΌΠ΅Ρ‚".split("ΠΌ") + assert_equal(s3, L("Π›ΠΎΡ€Π΅", " ипсу", " Π΄ΠΎΠ»ΠΎΡ€ сит Π°", "Π΅Ρ‚")) + + assert_equal("123".split(""), L("", "1", "2", "3", "")) + assert_equal("".join("123".split("")), "123") + assert_equal(",1,2,3,".split(","), "123".split("")) + assert_equal(",".join("123".split("")), ",1,2,3,") def test_splitlines(): @@ -1250,6 +1181,7 @@ def main(): test_combination_10_good_utf8_sequences() test_combination_10_good_10_bad_utf8_sequences() test_count_utf8_continuation_bytes() + test_count() test_split() test_splitlines() test_rstrip() diff --git a/mojo/stdlib/test/tempfile/test_tempfile.mojo b/mojo/stdlib/test/tempfile/test_tempfile.mojo index 0742f7cfa6..d28d2e0f38 100644 --- a/mojo/stdlib/test/tempfile/test_tempfile.mojo +++ b/mojo/stdlib/test/tempfile/test_tempfile.mojo @@ -41,7 +41,7 @@ def test_mkdtemp(): dir_name = mkdtemp(dir=Path().__fspath__()) assert_true(exists(dir_name), "Failed to create temporary directory") assert_true( - exists(Path() / dir_name.split(os.sep)[-1]), + exists(Path() / String(dir_name.split(os.sep)[-1])), "Expected directory to be created in cwd", ) os.rmdir(dir_name) diff --git a/mojo/stdlib/test/testing/test_assertion.mojo b/mojo/stdlib/test/testing/test_assertion.mojo index 68b3e17774..e4f93b752c 100644 --- a/mojo/stdlib/test/testing/test_assertion.mojo +++ b/mojo/stdlib/test/testing/test_assertion.mojo @@ -25,7 +25,7 @@ from testing import ( assert_true, ) -from utils import StringSlice +from collections.string import StringSlice from utils.numerics import inf, nan