-
Notifications
You must be signed in to change notification settings - Fork 932
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Migrate NVText Stemming APIs to pylibcudf (#17085)
Part of #15162. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Bradley Dice (https://github.com/bdice) URL: #17085
- Loading branch information
Showing
11 changed files
with
178 additions
and
43 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,3 +11,4 @@ nvtext | |
ngrams_tokenize | ||
normalize | ||
replace | ||
stemmer |
6 changes: 6 additions & 0 deletions
6
docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
======= | ||
stemmer | ||
======= | ||
|
||
.. automodule:: pylibcudf.nvtext.stemmer | ||
:members: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
from libcpp cimport bool | ||
from pylibcudf.column cimport Column | ||
from pylibcudf.libcudf.nvtext.stemmer cimport letter_type | ||
from pylibcudf.libcudf.types cimport size_type | ||
|
||
# Fused type that lets a single signature accept either a Column of
# per-row values or one size_type value.
ctypedef fused ColumnOrSize:
    Column
    size_type


# `indices` is specialized on ColumnOrSize: a Column or a single size_type.
cpdef Column is_letter(
    Column input,
    bool check_vowels,
    ColumnOrSize indices
)


# Takes one Column and returns a new Column.
cpdef Column porter_stemmer_measure(Column input)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
from libcpp cimport bool | ||
from libcpp.memory cimport unique_ptr | ||
from libcpp.utility cimport move | ||
from pylibcudf.column cimport Column | ||
from pylibcudf.libcudf.column.column cimport column | ||
from pylibcudf.libcudf.nvtext.stemmer cimport ( | ||
is_letter as cpp_is_letter, | ||
letter_type, | ||
porter_stemmer_measure as cpp_porter_stemmer_measure, | ||
) | ||
from pylibcudf.libcudf.types cimport size_type | ||
|
||
|
||
cpdef Column is_letter(
    Column input,
    bool check_vowels,
    ColumnOrSize indices
):
    """
    Report, for every string, whether the character at the requested
    position is a vowel or a consonant.

    For details, see :cpp:func:`is_letter`

    Parameters
    ----------
    input : Column
        Input strings
    check_vowels : bool
        If true, the check is for vowels. Otherwise the check is
        for consonants.
    indices : Union[Column, size_type]
        The character position(s) to check in each string

    Returns
    -------
    Column
        New boolean column.
    """
    cdef unique_ptr[column] c_result
    cdef letter_type ltype

    # Translate the Python-facing flag into the libcudf enum up front.
    if check_vowels:
        ltype = letter_type.VOWEL
    else:
        ltype = letter_type.CONSONANT

    with nogil:
        # The fused-type test selects the matching libcudf overload:
        # a size_type is passed as-is, a Column is passed as its view.
        if ColumnOrSize is size_type:
            c_result = cpp_is_letter(input.view(), ltype, indices)
        else:
            c_result = cpp_is_letter(input.view(), ltype, indices.view())

    return Column.from_libcudf(move(c_result))
|
||
|
||
cpdef Column porter_stemmer_measure(Column input):
    """
    Compute the Porter Stemmer measure for each string in a column.

    For details, see :cpp:func:`porter_stemmer_measure`

    Parameters
    ----------
    input : Column
        Strings column of words to measure

    Returns
    -------
    Column
        New column of measure values
    """
    cdef unique_ptr[column] c_measures

    # The libcudf call runs with the GIL released.
    with nogil:
        c_measures = cpp_porter_stemmer_measure(input.view())

    return Column.from_libcudf(move(c_measures))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
import pyarrow as pa | ||
import pylibcudf as plc | ||
import pytest | ||
from utils import assert_column_eq | ||
|
||
|
||
@pytest.fixture(scope="module")
def input_col():
    """Module-scoped pyarrow array of the words used by the stemmer tests."""
    return pa.array(["trouble", "toy", "syzygy"])
|
||
|
||
@pytest.mark.parametrize("check_vowels", [True, False])
@pytest.mark.parametrize("indices", [[3, 1, 4], 1])
def test_is_letter(input_col, check_vowels, indices):
    """Compare ``is_letter`` against a pure-Python reference.

    ``indices`` is either a list with one position per input string or a
    single position applied to every string.
    """
    per_row = isinstance(indices, list)

    def is_letter(s, i, check):
        # Reference check: membership in a fixed vowel set, compared with
        # the requested kind (True = vowel, False = consonant).
        vowels = "aeiouy"
        return (s[i] in vowels) == check

    plc_input = plc.interop.from_arrow(input_col)
    if per_row:
        plc_indices = plc.interop.from_arrow(pa.array(indices))
    else:
        plc_indices = indices
    result = plc.nvtext.stemmer.is_letter(plc_input, check_vowels, plc_indices)

    words = input_col.to_pylist()
    # Repeat a single position so each word pairs with exactly one index.
    positions = indices if per_row else [indices] * len(words)
    expected = pa.array(
        [
            is_letter(word, pos, check_vowels)
            for word, pos in zip(words, positions)
        ]
    )
    assert_column_eq(result, expected)
|
||
|
||
def test_porter_stemmer_measure(input_col):
    """Check ``porter_stemmer_measure`` against the expected int32 measures."""
    plc_words = plc.interop.from_arrow(input_col)
    result = plc.nvtext.stemmer.porter_stemmer_measure(plc_words)
    # Expected measures for "trouble", "toy", "syzygy" respectively.
    expected = pa.array([1, 1, 2], type=pa.int32())
    assert_column_eq(result, expected)