Skip to content

Commit

Permalink
fix wrong whitespace designation
Browse files Browse the repository at this point in the history
Signed-off-by: martinvuyk <martin.vuyklop@gmail.com>
  • Loading branch information
martinvuyk committed Feb 1, 2025
1 parent b367ba8 commit c29f8b5
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 17 deletions.
47 changes: 36 additions & 11 deletions stdlib/src/builtin/char.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -359,30 +359,55 @@ struct Char(CollectionElement, EqualityComparable, Intable, Stringable):
alias unicode_line_sep = Char.from_u32(0x2028).value()
alias unicode_paragraph_sep = Char.from_u32(0x2029).value()

return self.is_posix_space() or self in (
return self.is_ascii_space() or self in (
next_line,
unicode_line_sep,
unicode_paragraph_sep,
)

fn is_posix_space(self) -> Bool:
"""Returns True if this `Char` is a **space** character according to the
[POSIX locale][1].
"""Returns True if this `Char` is a **space** (aka. whitespace)
character according to the [POSIX locale](
https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap07.html#tag_07_03_01
): `" \\t\\n\\v\\f\\r"`.
The POSIX locale is also known as the C locale.
Returns:
True iff the character is one of the whitespace characters listed
above.
Notes:
The POSIX locale is also known as the C locale.
"""

# ASCII char
var c = UInt8(Int(self))

# NOTE: a global LUT doesn't work at compile time so we can't use it here.
alias ` ` = UInt8(ord(" "))
alias `\t` = UInt8(ord("\t"))
alias `\n` = UInt8(ord("\n"))
alias `\r` = UInt8(ord("\r"))
alias `\f` = UInt8(ord("\f"))
alias `\v` = UInt8(ord("\v"))

[1]: https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap07.html#tag_07_03_01
# A plain chain of equality checks: the optimizer lowers this to code that is reported to be even faster than a LUT.
return self.is_ascii() and (
c == ` `
or c == `\t`
or c == `\n`
or c == `\r`
or c == `\f`
or c == `\v`
)

This only respects the default "C" locale, i.e. returns True only if the
character specified is one of " \\t\\n\\v\\f\\r". For semantics similar
to Python, use `String.isspace()`.
fn is_ascii_space(self) -> Bool:
"""Determines whether the given character is an ASCII whitespace
character: `" \\t\\n\\v\\f\\r\\x1c\\x1d\\x1e"`.
Returns:
True iff the character is one of the whitespace characters listed
above.
"""
if not self.is_ascii():
return False

# ASCII char
var c = UInt8(Int(self))
Expand All @@ -399,7 +424,7 @@ struct Char(CollectionElement, EqualityComparable, Intable, Stringable):
alias `\x1e` = UInt8(ord("\x1e"))

# A plain chain of equality checks: the optimizer lowers this to code that is reported to be even faster than a LUT.
return (
return self.is_ascii() and (
c == ` `
or c == `\t`
or c == `\n`
Expand Down
8 changes: 4 additions & 4 deletions stdlib/src/collections/string/string.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -290,13 +290,13 @@ fn atol(str_slice: StringSlice, base: Int = 10) raises -> Int:
elif ord_letter_min[1] <= ord_current <= ord_letter_max[1]:
result += ord_current - ord_letter_min[1] + 10
found_valid_chars_after_start = True
elif Char(UInt8(ord_current)).is_posix_space():
elif Char(UInt8(ord_current)).is_ascii_space():
has_space_after_number = True
start = pos + 1
break
else:
raise Error(_str_to_base_error(base, str_slice))
if pos + 1 < str_len and not Char(buff[pos + 1]).is_posix_space():
if pos + 1 < str_len and not Char(buff[pos + 1]).is_ascii_space():
var nextresult = result * real_base
if nextresult < result:
raise Error(
Expand All @@ -310,7 +310,7 @@ fn atol(str_slice: StringSlice, base: Int = 10) raises -> Int:

if has_space_after_number:
for pos in range(start, str_len):
if not Char(buff[pos]).is_posix_space():
if not Char(buff[pos]).is_ascii_space():
raise Error(_str_to_base_error(base, str_slice))
if is_negative:
result = -result
Expand All @@ -332,7 +332,7 @@ fn _trim_and_handle_sign(str_slice: StringSlice, str_len: Int) -> (Int, Bool):
"""
var buff = str_slice.unsafe_ptr()
var start: Int = 0
while start < str_len and Char(buff[start]).is_posix_space():
while start < str_len and Char(buff[start]).is_ascii_space():
start += 1
var p: Bool = buff[start] == ord("+")
var n: Bool = buff[start] == ord("-")
Expand Down
4 changes: 2 additions & 2 deletions stdlib/src/collections/string/string_slice.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -967,7 +967,7 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]](
# if not s.isspace():
# break
# r_idx -= 1
while r_idx > 0 and Char(self.as_bytes()[r_idx - 1]).is_posix_space():
while r_idx > 0 and Char(self.as_bytes()[r_idx - 1]).is_ascii_space():
r_idx -= 1
return Self(unsafe_from_utf8=self.as_bytes()[:r_idx])

Expand Down Expand Up @@ -1019,7 +1019,7 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]](
# l_idx += 1
while (
l_idx < self.byte_length()
and Char(self.as_bytes()[l_idx]).is_posix_space()
and Char(self.as_bytes()[l_idx]).is_ascii_space()
):
l_idx += 1
return Self(unsafe_from_utf8=self.as_bytes()[l_idx:])
Expand Down
29 changes: 29 additions & 0 deletions stdlib/test/builtin/test_char.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,34 @@ def test_char_is_posix_space():
assert_false(Char.ord("n").is_posix_space())
assert_false(Char.ord("z").is_posix_space())
assert_false(Char.ord(".").is_posix_space())
assert_false(Char.ord("\x1c").is_posix_space())
assert_false(Char.ord("\x1d").is_posix_space())
assert_false(Char.ord("\x1e").is_posix_space())


def test_char_is_ascii_space():
    """Checks `Char.is_ascii_space()` against its documented character set.

    The method is documented to accept exactly `" \\t\\n\\v\\f\\r\\x1c\\x1d\\x1e"`.
    Every listed character is asserted once, followed by ordinary ASCII
    characters and non-ASCII code points that must all be rejected.
    """
    # True cases: each documented whitespace character, exactly once.
    assert_true(Char.ord(" ").is_ascii_space())
    assert_true(Char.ord("\t").is_ascii_space())
    assert_true(Char.ord("\n").is_ascii_space())
    assert_true(Char.ord("\v").is_ascii_space())
    assert_true(Char.ord("\f").is_ascii_space())
    assert_true(Char.ord("\r").is_ascii_space())
    assert_true(Char.ord("\x1c").is_ascii_space())
    assert_true(Char.ord("\x1d").is_ascii_space())
    assert_true(Char.ord("\x1e").is_ascii_space())

    # False cases: printable ASCII characters.
    assert_false(Char.ord("a").is_ascii_space())
    assert_false(Char.ord("u").is_ascii_space())
    assert_false(Char.ord("s").is_ascii_space())
    assert_false(Char.ord("t").is_ascii_space())
    assert_false(Char.ord("i").is_ascii_space())
    assert_false(Char.ord("n").is_ascii_space())
    assert_false(Char.ord("z").is_ascii_space())
    assert_false(Char.ord(".").is_ascii_space())

    # False cases: non-ASCII code points. U+0085, U+2028 and U+2029 are the
    # separators that the Python-style whitespace check adds on top of
    # `is_ascii_space()`, so they must not be accepted here.
    assert_false(Char.from_u32(0x85).value().is_ascii_space())
    assert_false(Char.from_u32(0x2028).value().is_ascii_space())
    assert_false(Char.from_u32(0x2029).value().is_ascii_space())

    # U+0120 has 0x20 (ASCII space) as its low byte. This guards against a
    # byte-narrowing comparison reporting it as whitespace when the
    # `is_ascii()` check is missing.
    assert_false(Char.from_u32(0x120).value().is_ascii_space())


def test_char_is_lower():
Expand Down Expand Up @@ -234,6 +262,7 @@ def main():
test_char_formatting()
test_char_properties()
test_char_is_posix_space()
test_char_is_ascii_space()
test_char_is_lower()
test_char_is_upper()
test_char_is_digit()
Expand Down

0 comments on commit c29f8b5

Please sign in to comment.