Skip to content

Commit

Permalink
refactor: remove large tests and fix docstring
Browse files Browse the repository at this point in the history
  • Loading branch information
f4t4nt committed Jan 14, 2025
1 parent e5a291c commit 9915afe
Show file tree
Hide file tree
Showing 4 changed files with 5 additions and 113 deletions.
8 changes: 4 additions & 4 deletions daft/expressions/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3629,7 +3629,7 @@ def concat(self, other: Expression) -> Expression:
return Expression._from_pyexpr(native.binary_concat(self._expr, other_expr._expr))

def slice(self, start: Expression | int, length: Expression | int | None = None) -> Expression:
r"""Returns a substring of each binary string.
r"""Returns a slice of each binary string.
Example:
>>> import daft
Expand All @@ -3651,11 +3651,11 @@ def slice(self, start: Expression | int, length: Expression | int | None = None)
(Showing first 3 of 3 rows)
Args:
start: The starting position (0-based) of the substring.
length: The length of the substring. If None, returns all characters from start to the end.
start: The starting position (0-based) of the slice.
length: The length of the slice. If None, returns all characters from start to the end.
Returns:
A new expression representing the substring.
A new expression representing the slice.
"""
start_expr = Expression._to_expression(start)
length_expr = Expression._to_expression(length)
Expand Down
42 changes: 0 additions & 42 deletions tests/table/binary/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,48 +109,6 @@ def test_binary_concat(
assert result.to_pydict() == {"a": expected_result}


def test_binary_concat_large() -> None:
    # Concatenation must work with kilobyte-scale binary values in any
    # position (left, right, or both sides of the operator).
    big = b"x" * 1000
    table = MicroPartition.from_pydict(
        {"a": [big, b"small", big], "b": [big, big, b"small"]}
    )
    result = table.eval_expression_list([col("a").binary.concat(col("b"))])
    expected = [
        b"x" * 2000,  # large + large
        b"small" + b"x" * 1000,  # small + large
        b"x" * 1000 + b"small",  # large + small
    ]
    assert result.to_pydict() == {"a": expected}


def test_binary_concat_very_large() -> None:
    # Stress-test concat with megabyte-scale binary inputs, including null
    # bytes, high bytes, and multi-byte UTF-8 sequences.
    data_a = [
        b"x" * 1_000_000,  # 1MB of 'x'
        b"\x00" * 1_000_000,  # 1MB of null bytes
        b"Hello\x00World" * 100_000,  # repeated pattern
        b"\xe2\x98\x83" * 333_333,  # many snowmen
    ]
    data_b = [
        b"y" * 1_000_000,  # 1MB of 'y'
        b"\xff" * 1_000_000,  # 1MB of high bytes
        b"Test\x00Case" * 100_000,  # different pattern
        b"\xf0\x9f\x98\x89" * 250_000,  # many winking faces
    ]
    # Expected output is the element-wise concatenation of the two columns.
    expected = [left + right for left, right in zip(data_a, data_b)]

    table = MicroPartition.from_pydict({"a": data_a, "b": data_b})
    result = table.eval_expression_list([col("a").binary.concat(col("b"))])
    assert result.to_pydict() == {"a": expected}


@pytest.mark.parametrize(
"input_data,literal,expected_result",
[
Expand Down
42 changes: 0 additions & 42 deletions tests/table/utf8/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,48 +107,6 @@ def test_utf8_concat(input_a: list[str | None], input_b: list[str | None], expec
assert result.to_pydict() == {"a": expected_result}


def test_utf8_concat_large() -> None:
    # Concatenation must work with kilobyte-scale strings in any position
    # (left, right, or both sides of the operator).
    big = "x" * 1000
    table = MicroPartition.from_pydict(
        {"a": [big, "small", big], "b": [big, big, "small"]}
    )
    result = table.eval_expression_list([col("a").str.concat(col("b"))])
    expected = [
        "x" * 2000,  # large + large
        "small" + "x" * 1000,  # small + large
        "x" * 1000 + "small",  # large + small
    ]
    assert result.to_pydict() == {"a": expected}


def test_utf8_concat_very_large() -> None:
    # Stress-test concat with megabyte-scale strings, including embedded
    # null characters and multi-byte (non-ASCII) characters.
    data_a = [
        "x" * 1_000_000,  # 1MB of 'x'
        "\u0000" * 1_000_000,  # 1MB of null characters
        "Hello\u0000World" * 100_000,  # repeated pattern
        "☃" * 333_333,  # many snowmen
    ]
    data_b = [
        "y" * 1_000_000,  # 1MB of 'y'
        "z" * 1_000_000,  # 1MB of 'z'
        "Test\u0000Case" * 100_000,  # different pattern
        "😉" * 250_000,  # many winking faces
    ]
    # Expected output is the element-wise concatenation of the two columns.
    expected = [left + right for left, right in zip(data_a, data_b)]

    table = MicroPartition.from_pydict({"a": data_a, "b": data_b})
    result = table.eval_expression_list([col("a").str.concat(col("b"))])
    assert result.to_pydict() == {"a": expected}


@pytest.mark.parametrize(
"input_data,literal,expected_result",
[
Expand Down
26 changes: 1 addition & 25 deletions tests/table/utf8/test_length.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
["Hello", None, "", "\u0000", None, "Test", ""],
[5, None, 0, 1, None, 4, 0],
),
# Very large strings
# Large strings
(
["x" * 1000, "y" * 10000, "z" * 100000],
[1000, 10000, 100000],
Expand Down Expand Up @@ -99,27 +99,3 @@ def test_utf8_length(input_data: list[str | None], expected_lengths: list[int |
table = MicroPartition.from_pydict({"col": input_data})
result = table.eval_expression_list([col("col").str.length()])
assert result.to_pydict() == {"col": expected_lengths}


def test_utf8_length_large_sequences() -> None:
    # length() must count codepoints correctly for megabyte-scale strings,
    # and still propagate nulls / handle empty strings.
    large_data = [
        "x" * 1_000_000,  # 1MB of 'x'
        "\u0000" * 1_000_000,  # 1MB of null characters
        "Hello\u0000World" * 100_000,  # repeated pattern with null
        "☃" * 333_333,  # many snowmen
        None,  # null value
        "",  # empty string
    ]
    # Expected lengths mirror Python's len(); None stays None.
    expected_lengths = [None if s is None else len(s) for s in large_data]

    table = MicroPartition.from_pydict({"col": large_data})
    result = table.eval_expression_list([col("col").str.length()])
    assert result.to_pydict() == {"col": expected_lengths}

0 comments on commit 9915afe

Please sign in to comment.