Skip to content

Commit

Permalink
Migrate NVText Stemming APIs to pylibcudf (#17085)
Browse files Browse the repository at this point in the history
A part of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #17085
  • Loading branch information
Matt711 authored Oct 24, 2024
1 parent 0287972 commit d7cdf44
Show file tree
Hide file tree
Showing 11 changed files with 178 additions and 43 deletions.
8 changes: 4 additions & 4 deletions cpp/include/nvtext/stemmer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ enum class letter_type {
*
* @code{.pseudo}
* Example:
* st = ["trouble", "toy", "sygyzy"]
* st = ["trouble", "toy", "syzygy"]
* b1 = is_letter(st, VOWEL, 1)
* b1 is now [false, true, true]
* @endcode
Expand All @@ -62,7 +62,7 @@ enum class letter_type {
*
* @code{.pseudo}
* Example:
* st = ["trouble", "toy", "sygyzy"]
* st = ["trouble", "toy", "syzygy"]
* b2 = is_letter(st, CONSONANT, -1) // last letter checked in each string
* b2 is now [false, true, false]
* @endcode
Expand Down Expand Up @@ -99,7 +99,7 @@ std::unique_ptr<cudf::column> is_letter(
*
* @code{.pseudo}
* Example:
* st = ["trouble", "toy", "sygyzy"]
* st = ["trouble", "toy", "syzygy"]
* ix = [3, 1, 4]
* b1 = is_letter(st, VOWEL, ix)
* b1 is now [true, true, false]
Expand All @@ -111,7 +111,7 @@ std::unique_ptr<cudf::column> is_letter(
*
* @code{.pseudo}
* Example:
* st = ["trouble", "toy", "sygyzy"]
* st = ["trouble", "toy", "syzygy"]
* ix = [3, -2, 4] // 2nd to last character in st[1] is checked
* b2 = is_letter(st, CONSONANT, ix)
* b2 is now [false, false, true]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ nvtext
ngrams_tokenize
normalize
replace
stemmer
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=======
stemmer
=======

.. automodule:: pylibcudf.nvtext.stemmer
   :members:
56 changes: 21 additions & 35 deletions python/cudf/cudf/_lib/nvtext/stemmer.pyx
Original file line number Diff line number Diff line change
@@ -1,24 +1,19 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from enum import IntEnum

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.nvtext.stemmer cimport (
is_letter as cpp_is_letter,
letter_type,
porter_stemmer_measure as cpp_porter_stemmer_measure,
underlying_type_t_letter_type,
)
from pylibcudf.libcudf.types cimport size_type

from cudf._lib.column cimport Column

from pylibcudf import nvtext


class LetterType(IntEnum):
CONSONANT = <underlying_type_t_letter_type> letter_type.CONSONANT
Expand All @@ -27,43 +22,34 @@ class LetterType(IntEnum):

@acquire_spill_lock()
def porter_stemmer_measure(Column strings):
cdef column_view c_strings = strings.view()
cdef unique_ptr[column] c_result

with nogil:
c_result = move(cpp_porter_stemmer_measure(c_strings))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(
nvtext.stemmer.porter_stemmer_measure(
strings.to_pylibcudf(mode="read"),
)
)


@acquire_spill_lock()
def is_letter(Column strings,
object ltype,
size_type index):
cdef column_view c_strings = strings.view()
cdef letter_type c_ltype = <letter_type>(
<underlying_type_t_letter_type> ltype
return Column.from_pylibcudf(
nvtext.stemmer.is_letter(
strings.to_pylibcudf(mode="read"),
ltype==LetterType.VOWEL,
index,
)
)
cdef unique_ptr[column] c_result

with nogil:
c_result = move(cpp_is_letter(c_strings, c_ltype, index))

return Column.from_unique_ptr(move(c_result))


@acquire_spill_lock()
def is_letter_multi(Column strings,
object ltype,
Column indices):
cdef column_view c_strings = strings.view()
cdef column_view c_indices = indices.view()
cdef letter_type c_ltype = <letter_type>(
<underlying_type_t_letter_type> ltype
return Column.from_pylibcudf(
nvtext.stemmer.is_letter(
strings.to_pylibcudf(mode="read"),
ltype==LetterType.VOWEL,
indices.to_pylibcudf(mode="read"),
)
)
cdef unique_ptr[column] c_result

with nogil:
c_result = move(cpp_is_letter(c_strings, c_ltype, c_indices))

return Column.from_unique_ptr(move(c_result))
7 changes: 4 additions & 3 deletions python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libc.stdint cimport int32_t
from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.types cimport size_type


cdef extern from "nvtext/stemmer.hpp" namespace "nvtext" nogil:
ctypedef enum letter_type:
CONSONANT 'nvtext::letter_type::CONSONANT'
VOWEL 'nvtext::letter_type::VOWEL'
cpdef enum class letter_type:
CONSONANT
VOWEL

cdef unique_ptr[column] porter_stemmer_measure(
const column_view & strings
Expand Down
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# =============================================================================

set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx
ngrams_tokenize.pyx normalize.pyx replace.pyx
ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx
)

set(linked_libraries cudf::cudf)
Expand Down
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ from . cimport (
ngrams_tokenize,
normalize,
replace,
stemmer,
)

__all__ = [
Expand All @@ -18,4 +19,5 @@ __all__ = [
"ngrams_tokenize",
"normalize",
"replace",
"stemmer",
]
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
ngrams_tokenize,
normalize,
replace,
stemmer,
)

__all__ = [
Expand All @@ -18,4 +19,5 @@
"ngrams_tokenize",
"normalize",
"replace",
"stemmer",
]
14 changes: 14 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/stemmer.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp cimport bool
from pylibcudf.column cimport Column
from pylibcudf.libcudf.nvtext.stemmer cimport letter_type
from pylibcudf.libcudf.types cimport size_type

# Fused type that lets ``is_letter`` take its ``indices`` argument either as
# a Column of character positions or as a single ``size_type`` position.
ctypedef fused ColumnOrSize:
    Column
    size_type

# Checks the character at ``indices`` in each string of ``input`` for being a
# vowel (``check_vowels`` true) or a consonant (``check_vowels`` false).
cpdef Column is_letter(Column input, bool check_vowels, ColumnOrSize indices)

# Computes the Porter Stemmer measure of each string in ``input``.
cpdef Column porter_stemmer_measure(Column input)
76 changes: 76 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/stemmer.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.nvtext.stemmer cimport (
is_letter as cpp_is_letter,
letter_type,
porter_stemmer_measure as cpp_porter_stemmer_measure,
)
from pylibcudf.libcudf.types cimport size_type


cpdef Column is_letter(
    Column input,
    bool check_vowels,
    ColumnOrSize indices
):
    """
    Returns a boolean column indicating whether the character at the
    given index (or at each of the given indices) of every string is
    a vowel or a consonant.

    For details, see :cpp:func:`is_letter`

    Parameters
    ----------
    input : Column
        Input strings
    check_vowels : bool
        If true, the check is for vowels. Otherwise the check is
        for consonants.
    indices : Union[Column, size_type]
        The character position(s) to check in each string

    Returns
    -------
    Column
        New boolean column.
    """
    cdef unique_ptr[column] c_output
    # Translate the Python-facing flag into the libcudf enum once, up front.
    cdef letter_type c_ltype = (
        letter_type.VOWEL if check_vowels else letter_type.CONSONANT
    )

    # ``ColumnOrSize`` is a fused type: only the branch matching the
    # specialization being compiled is kept, so each specialization calls
    # exactly one libcudf overload.
    if ColumnOrSize is size_type:
        with nogil:
            c_output = cpp_is_letter(input.view(), c_ltype, indices)
    else:
        with nogil:
            c_output = cpp_is_letter(input.view(), c_ltype, indices.view())

    return Column.from_libcudf(move(c_output))


cpdef Column porter_stemmer_measure(Column input):
    """
    Computes the Porter Stemmer measure for each string in a
    strings column.

    For details, see :cpp:func:`porter_stemmer_measure`

    Parameters
    ----------
    input : Column
        Strings column of words to measure

    Returns
    -------
    Column
        New column of measure values
    """
    cdef unique_ptr[column] c_measures

    # The libcudf call is made with the GIL released.
    with nogil:
        c_measures = cpp_porter_stemmer_measure(input.view())

    return Column.from_libcudf(move(c_measures))
47 changes: 47 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pylibcudf as plc
import pytest
from utils import assert_column_eq


@pytest.fixture(scope="module")
def input_col():
    """Module-scoped pyarrow array of the words shared by the stemmer tests."""
    return pa.array(["trouble", "toy", "syzygy"])


@pytest.mark.parametrize("check_vowels", [True, False])
@pytest.mark.parametrize("indices", [[3, 1, 4], 1])
def test_is_letter(input_col, check_vowels, indices):
    """Compare ``is_letter`` with a pure-Python reference.

    ``indices`` is either a list holding one position per string or a
    single position applied to every string.
    """
    words = input_col.to_pylist()
    plc_input = plc.interop.from_arrow(input_col)

    if isinstance(indices, list):
        # Per-row positions are passed to pylibcudf as a column.
        plc_indices = plc.interop.from_arrow(pa.array(indices))
        positions = indices
    else:
        # A scalar position is passed through unchanged and repeated
        # for the reference computation.
        plc_indices = indices
        positions = [indices] * len(words)

    got = plc.nvtext.stemmer.is_letter(plc_input, check_vowels, plc_indices)

    # The reference counts "y" as a vowel unconditionally.
    vowels = "aeiouy"
    expected = pa.array(
        [
            (word[pos] in vowels) == check_vowels
            for word, pos in zip(words, positions)
        ]
    )
    assert_column_eq(got, expected)


def test_porter_stemmer_measure(input_col):
    """Check the Porter Stemmer measures of the shared fixture words."""
    plc_input = plc.interop.from_arrow(input_col)
    got = plc.nvtext.stemmer.porter_stemmer_measure(plc_input)

    expected = pa.array([1, 1, 2], type=pa.int32())
    assert_column_eq(got, expected)

0 comments on commit d7cdf44

Please sign in to comment.