diff --git a/daft/expressions/expressions.py b/daft/expressions/expressions.py index 44177de762..34eef84af4 100644 --- a/daft/expressions/expressions.py +++ b/daft/expressions/expressions.py @@ -3629,7 +3629,7 @@ def concat(self, other: Expression) -> Expression: return Expression._from_pyexpr(native.binary_concat(self._expr, other_expr._expr)) def slice(self, start: Expression | int, length: Expression | int | None = None) -> Expression: - r"""Returns a substring of each binary string. + r"""Returns a slice of each binary string. Example: >>> import daft @@ -3651,11 +3651,11 @@ def slice(self, start: Expression | int, length: Expression | int | None = None) (Showing first 3 of 3 rows) Args: - start: The starting position (0-based) of the substring. - length: The length of the substring. If None, returns all characters from start to the end. + start: The starting position (0-based) of the slice. + length: The length of the slice. If None, returns all bytes from start to the end. Returns: - A new expression representing the substring. + A new expression representing the slice. 
""" start_expr = Expression._to_expression(start) length_expr = Expression._to_expression(length) diff --git a/tests/table/binary/test_concat.py b/tests/table/binary/test_concat.py index 65024c70f7..cc603d5d39 100644 --- a/tests/table/binary/test_concat.py +++ b/tests/table/binary/test_concat.py @@ -109,48 +109,6 @@ def test_binary_concat( assert result.to_pydict() == {"a": expected_result} -def test_binary_concat_large() -> None: - # Test concatenating large binary strings - large_binary = b"x" * 1000 - table = MicroPartition.from_pydict( - {"a": [large_binary, b"small", large_binary], "b": [large_binary, large_binary, b"small"]} - ) - result = table.eval_expression_list([col("a").binary.concat(col("b"))]) - assert result.to_pydict() == { - "a": [ - b"x" * 2000, # Two large binaries concatenated - b"small" + (b"x" * 1000), # Small + large - (b"x" * 1000) + b"small", # Large + small - ] - } - - -def test_binary_concat_very_large() -> None: - # Test with very large binary sequences - data_a = [ - b"x" * 1_000_000, # 1MB of 'x' - b"\x00" * 1_000_000, # 1MB of null bytes - b"Hello\x00World" * 100_000, # Repeated pattern - b"\xe2\x98\x83" * 333_333, # Many snowmen - ] - data_b = [ - b"y" * 1_000_000, # 1MB of 'y' - b"\xff" * 1_000_000, # 1MB of high bytes - b"Test\x00Case" * 100_000, # Different pattern - b"\xf0\x9f\x98\x89" * 250_000, # Many winking faces - ] - expected = [ - (b"x" * 1_000_000) + (b"y" * 1_000_000), - (b"\x00" * 1_000_000) + (b"\xff" * 1_000_000), - (b"Hello\x00World" * 100_000) + (b"Test\x00Case" * 100_000), - (b"\xe2\x98\x83" * 333_333) + (b"\xf0\x9f\x98\x89" * 250_000), - ] - - table = MicroPartition.from_pydict({"a": data_a, "b": data_b}) - result = table.eval_expression_list([col("a").binary.concat(col("b"))]) - assert result.to_pydict() == {"a": expected} - - @pytest.mark.parametrize( "input_data,literal,expected_result", [ diff --git a/tests/table/utf8/test_concat.py b/tests/table/utf8/test_concat.py index 35af00398f..d3bda39d8e 100644 --- 
a/tests/table/utf8/test_concat.py +++ b/tests/table/utf8/test_concat.py @@ -107,48 +107,6 @@ def test_utf8_concat(input_a: list[str | None], input_b: list[str | None], expec assert result.to_pydict() == {"a": expected_result} -def test_utf8_concat_large() -> None: - # Test concatenating large strings - large_string = "x" * 1000 - table = MicroPartition.from_pydict( - {"a": [large_string, "small", large_string], "b": [large_string, large_string, "small"]} - ) - result = table.eval_expression_list([col("a").str.concat(col("b"))]) - assert result.to_pydict() == { - "a": [ - "x" * 2000, # Two large strings concatenated - "small" + ("x" * 1000), # Small + large - ("x" * 1000) + "small", # Large + small - ] - } - - -def test_utf8_concat_very_large() -> None: - # Test with very large string sequences - data_a = [ - "x" * 1_000_000, # 1MB of 'x' - "\u0000" * 1_000_000, # 1MB of null characters - "Hello\u0000World" * 100_000, # Repeated pattern - "☃" * 333_333, # Many snowmen - ] - data_b = [ - "y" * 1_000_000, # 1MB of 'y' - "z" * 1_000_000, # 1MB of 'z' - "Test\u0000Case" * 100_000, # Different pattern - "😉" * 250_000, # Many winking faces - ] - expected = [ - ("x" * 1_000_000) + ("y" * 1_000_000), - ("\u0000" * 1_000_000) + ("z" * 1_000_000), - ("Hello\u0000World" * 100_000) + ("Test\u0000Case" * 100_000), - ("☃" * 333_333) + ("😉" * 250_000), - ] - - table = MicroPartition.from_pydict({"a": data_a, "b": data_b}) - result = table.eval_expression_list([col("a").str.concat(col("b"))]) - assert result.to_pydict() == {"a": expected} - - @pytest.mark.parametrize( "input_data,literal,expected_result", [ diff --git a/tests/table/utf8/test_length.py b/tests/table/utf8/test_length.py index 30cf2a712e..8925d493fc 100644 --- a/tests/table/utf8/test_length.py +++ b/tests/table/utf8/test_length.py @@ -31,7 +31,7 @@ ["Hello", None, "", "\u0000", None, "Test", ""], [5, None, 0, 1, None, 4, 0], ), - # Very large strings + # Large strings ( ["x" * 1000, "y" * 10000, "z" * 100000], [1000, 
10000, 100000], @@ -99,27 +99,3 @@ def test_utf8_length(input_data: list[str | None], expected_lengths: list[int | table = MicroPartition.from_pydict({"col": input_data}) result = table.eval_expression_list([col("col").str.length()]) assert result.to_pydict() == {"col": expected_lengths} - - -def test_utf8_length_large_sequences() -> None: - # Test with very large string sequences - large_data = [ - "x" * 1_000_000, # 1MB of 'x' - "\u0000" * 1_000_000, # 1MB of null characters - "Hello\u0000World" * 100_000, # Repeated pattern with null - "☃" * 333_333, # Many snowmen - None, # Null value - "", # Empty string - ] - expected_lengths = [ - 1_000_000, - 1_000_000, - 1_100_000, # 11 chars * 100_000 - 333_333, # 1 char * 333_333 - None, - 0, - ] - - table = MicroPartition.from_pydict({"col": large_data}) - result = table.eval_expression_list([col("col").str.length()]) - assert result.to_pydict() == {"col": expected_lengths}