Skip to content

Commit

Permalink
Migrate remaining nvtext NGrams APIs to pylibcudf (#17070)
Browse files Browse the repository at this point in the history
A part of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - https://github.com/brandon-b-miller

URL: #17070
  • Loading branch information
Matt711 authored Oct 16, 2024
1 parent 7bcfc87 commit 3420c71
Show file tree
Hide file tree
Showing 9 changed files with 135 additions and 40 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ nvtext
generate_ngrams
jaccard
minhash
ngrams_tokenize
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
===============
ngrams_tokenize
===============

.. automodule:: pylibcudf.nvtext.ngrams_tokenize
:members:
46 changes: 10 additions & 36 deletions python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,48 +2,22 @@

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.nvtext.ngrams_tokenize cimport (
ngrams_tokenize as cpp_ngrams_tokenize,
)
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.types cimport size_type

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar

from pylibcudf import nvtext


@acquire_spill_lock()
def ngrams_tokenize(
Column strings,
Column input,
int ngrams,
object py_delimiter,
object py_separator
):

cdef DeviceScalar delimiter = py_delimiter.device_value
cdef DeviceScalar separator = py_separator.device_value

cdef column_view c_strings = strings.view()
cdef size_type c_ngrams = ngrams
cdef const string_scalar* c_separator = <const string_scalar*>separator\
.get_raw_ptr()
cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\
.get_raw_ptr()
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_ngrams_tokenize(
c_strings,
c_ngrams,
c_delimiter[0],
c_separator[0]
)
)

return Column.from_unique_ptr(move(c_result))
result = nvtext.ngrams_tokenize.ngrams_tokenize(
input.to_pylibcudf(mode="read"),
ngrams,
py_delimiter.device_value.c_value,
py_separator.device_value.c_value
)
return Column.from_pylibcudf(result)
4 changes: 3 additions & 1 deletion python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
# the License.
# =============================================================================

set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx)
set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx
ngrams_tokenize.pyx
)

set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand Down
11 changes: 9 additions & 2 deletions python/pylibcudf/pylibcudf/nvtext/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport edit_distance, generate_ngrams, jaccard, minhash
from . cimport (
edit_distance,
generate_ngrams,
jaccard,
minhash,
ngrams_tokenize,
)

__all__ = [
"edit_distance",
"generate_ngrams",
"jaccard",
"minhash"
"minhash",
"ngrams_tokenize"
]
3 changes: 2 additions & 1 deletion python/pylibcudf/pylibcudf/nvtext/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import edit_distance, generate_ngrams, jaccard, minhash
from . import edit_distance, generate_ngrams, jaccard, minhash, ngrams_tokenize

__all__ = [
"edit_distance",
"generate_ngrams",
"jaccard",
"minhash",
"ngrams_tokenize",
]
13 changes: 13 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column
from pylibcudf.libcudf.types cimport size_type
from pylibcudf.scalar cimport Scalar


# Public declaration for the ngrams_tokenize wrapper; the implementation
# (which forwards to libcudf's nvtext::ngrams_tokenize) lives in
# ngrams_tokenize.pyx.
cpdef Column ngrams_tokenize(
    Column input,
    size_type ngrams,
    Scalar delimiter,
    Scalar separator
)
54 changes: 54 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cython.operator cimport dereference
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.nvtext.ngrams_tokenize cimport (
ngrams_tokenize as cpp_ngrams_tokenize,
)
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.types cimport size_type
from pylibcudf.scalar cimport Scalar


cpdef Column ngrams_tokenize(
    Column input,
    size_type ngrams,
    Scalar delimiter,
    Scalar separator
):
    """
    Returns a single column of strings by tokenizing the input strings column
    and then producing ngrams of each string.

    For details, see :cpp:func:`ngrams_tokenize`

    Parameters
    ----------
    input : Column
        Input strings
    ngrams : size_type
        The ngram number to generate
    delimiter : Scalar
        UTF-8 characters used to separate each string into tokens.
        An empty string will separate tokens using whitespace.
    separator : Scalar
        The string to use for separating ngram tokens

    Returns
    -------
    Column
        New strings columns of tokens
    """
    cdef unique_ptr[column] c_result

    with nogil:
        # The casts assume both scalars hold string values (string_scalar);
        # NOTE(review): callers are expected to pass string scalars -- confirm
        # no validation is needed here.
        c_result = cpp_ngrams_tokenize(
            input.view(),
            ngrams,
            dereference(<const string_scalar*>delimiter.get()),
            dereference(<const string_scalar*>separator.get()),
        )
    # Take ownership of the libcudf result column and wrap it.
    return Column.from_libcudf(move(c_result))
37 changes: 37 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pylibcudf as plc
import pytest
from utils import assert_column_eq


@pytest.fixture(scope="module")
def input_col():
    """Strings column covering '*', space, and '-' as potential delimiters."""
    return pa.array(["a*b*c*d", "a b c d", "a-b-c-d", "a*b c-d"])


@pytest.mark.parametrize("ngrams", [2, 3])
@pytest.mark.parametrize("delim", ["*", " ", "-"])
@pytest.mark.parametrize("sep", ["_", "&", ","])
def test_ngrams_tokenize(input_col, ngrams, delim, sep):
    def reference(strings, ngrams, delim, sep):
        # Pure-Python reference: split each string on delim, then emit every
        # consecutive window of `ngrams` tokens joined with sep.
        out = []
        for s in strings:
            parts = s.split(delim)
            out.extend(
                sep.join(parts[i : i + ngrams])
                for i in range(len(parts) - ngrams + 1)
            )
        return out

    result = plc.nvtext.ngrams_tokenize.ngrams_tokenize(
        plc.interop.from_arrow(input_col),
        ngrams,
        plc.interop.from_arrow(pa.scalar(delim)),
        plc.interop.from_arrow(pa.scalar(sep)),
    )
    expected = pa.array(
        reference(input_col.to_pylist(), ngrams, delim, sep)
    )
    assert_column_eq(result, expected)

0 comments on commit 3420c71

Please sign in to comment.