Skip to content

Commit

Permalink
Add new nvtext::normalize_characters API (#17818)
Browse files Browse the repository at this point in the history
Adds new normalizer APIs as part of the rework for the subword-tokenizer.
The new API is split into 2 parts. First, a normalizer object is created with the appropriate state: the lower-case setting and the special tokens. The normalizing tables are currently hardcoded inside libcudf. Future versions may load these tables from some other source. The 2nd API is given the input strings column and the normalizer object and returns a normalized strings column. The normalizer object can be reused on all subsequent `normalize_characters` calls.

The current `nvtext::normalize_characters` loads the normalizing tables on each call which can be significant overhead. This API will be deprecated and replaced by these 2 new ones. Some utility functions from that implementation have been refactored to be used by both until the old one is removed.

The first API creates the normalizer object.
```cpp
std::unique_ptr<character_normalizer> create_character_normalizer(
  bool do_lower_case,
  cudf::strings_column_view const& special_tokens,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr);
```

The 2nd API uses the normalizer on a strings column:
```cpp
std::unique_ptr<cudf::column> normalize_characters(
  cudf::strings_column_view const& input,
  character_normalizer const& normalizer,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr);
```

Using the python interface:
```python
import cudf
from cudf.core.character_normalizer import CharacterNormalizer

cn = CharacterNormalizer(do_lower=False)
sn = cn.normalize(input_strings)

```

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Tianyu Liu (https://github.com/kingcrimsontianyu)
  - Karthikeyan (https://github.com/karthikeyann)
  - Matthew Murray (https://github.com/Matt711)

URL: #17818
  • Loading branch information
davidwendt authored Feb 25, 2025
1 parent e365986 commit 18a5412
Show file tree
Hide file tree
Showing 16 changed files with 1,018 additions and 154 deletions.
9 changes: 6 additions & 3 deletions cpp/benchmarks/text/normalize.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
* Copyright (c) 2021-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -48,15 +48,18 @@ static void bench_normalize(nvbench::state& state)
[&](nvbench::launch& launch) { auto result = nvtext::normalize_spaces(input); });
} else {
bool const to_lower = (normalize_type == "to_lower");
// we expect the normalizer to be created once and re-used
// so creating it is not measured
auto normalizer = nvtext::create_character_normalizer(to_lower);
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = nvtext::normalize_characters(input, to_lower);
auto result = nvtext::normalize_characters(input, *normalizer);
});
}
}

NVBENCH_BENCH(bench_normalize)
.set_name("normalize")
.add_int64_axis("min_width", {0})
.add_int64_axis("max_width", {32, 64, 128, 256})
.add_int64_axis("max_width", {128, 256})
.add_int64_axis("num_rows", {32768, 262144, 2097152})
.add_string_axis("type", {"spaces", "characters", "to_lower"});
14 changes: 13 additions & 1 deletion cpp/include/cudf/strings/detail/utilities.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -96,5 +96,17 @@ int64_t get_offset_value(cudf::column_view const& offsets,
size_type index,
rmm::cuda_stream_view stream);

/**
* @brief Return the first and last offset in the given strings column
*
* This accounts for sliced input columns as well: the first value is the
* offset at index `input.offset()` (0 when the column is not sliced) and the
* last value is the offset at index `input.offset() + input.size()`.
* An empty input column returns `{0, 0}`.
*
* @param input Strings column
* @param stream CUDA stream used for device memory operations and kernel launches
* @return Pair containing the first and last offset values, in that order
*/
std::pair<int64_t, int64_t> get_first_and_last_offset(cudf::strings_column_view const& input,
rmm::cuda_stream_view stream);

} // namespace strings::detail
} // namespace CUDF_EXPORT cudf
111 changes: 110 additions & 1 deletion cpp/include/nvtext/normalize.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -16,6 +16,7 @@
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/export.hpp>
#include <cudf/utilities/memory_resource.hpp>
Expand Down Expand Up @@ -107,5 +108,113 @@ std::unique_ptr<cudf::column> normalize_characters(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Normalizer object to be used with nvtext::normalize_characters
*
* Use nvtext::create_character_normalizer to create this object.
*
* This normalizer includes:
*
* - adding padding around punctuation (unicode category starts with "P")
* as well as certain ASCII symbols like "^" and "$"
* - adding padding around the [CJK Unicode block
* characters](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block))
* - changing whitespace (e.g. `"\t", "\n", "\r"`) to just space `" "`
* - removing control characters (unicode categories "Cc" and "Cf")
*
* The padding process adds a single space before and after the character.
* Details on _unicode category_ can be found here:
* https://unicodebook.readthedocs.io/unicode.html#categories
*
* If `do_lower_case = true`, lower-casing also removes any accents. The
* accents cannot be removed from upper-case characters without lower-casing
* and lower-casing cannot be performed without also removing accents.
* However, if the accented character is already lower-case, then only the
* accent is removed.
*
* If `special_tokens` are included, the padding after `[` and before `]` is not
* inserted if the characters between them match one of the given tokens.
* Also, the `special_tokens` are expected to include the `[]` characters
* at the beginning and end of each string appropriately.
*/
struct character_normalizer {
/**
* @brief Normalizer object constructor
*
* This initializes and holds the character normalizing tables and settings.
*
* @param do_lower_case If true, upper-case characters are converted to
* lower-case and accents are stripped from those characters.
* If false, accented and upper-case characters are not transformed.
* @param special_tokens Each row is a token including the `[]` brackets.
* For example: `[BOS]`, `[EOS]`, `[UNK]`, `[SEP]`, `[PAD]`, `[CLS]`, `[MASK]`
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the device memory held by this object
*/
character_normalizer(bool do_lower_case,
cudf::strings_column_view const& special_tokens,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
// Declared without a body here; character_normalizer_impl is only
// forward-declared in this header.
~character_normalizer();

// Implementation type; only forward-declared in this header.
struct character_normalizer_impl;
// Owning pointer to the implementation object.
std::unique_ptr<character_normalizer_impl> _impl;
};

/**
* @brief Create a normalizer object
*
* Creates a normalizer object which can be reused on multiple calls to
* nvtext::normalize_characters
*
* @see nvtext::character_normalizer
*
* @param do_lower_case If true, upper-case characters are converted to
* lower-case and accents are stripped from those characters.
* If false, accented and upper-case characters are not transformed.
* @param special_tokens Individual tokens including `[]` brackets.
* Default is no special tokens (an empty strings column).
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the device memory held by the returned object
* @return Object to be used with nvtext::normalize_characters
*/
std::unique_ptr<character_normalizer> create_character_normalizer(
bool do_lower_case,
cudf::strings_column_view const& special_tokens = cudf::strings_column_view(cudf::column_view{
cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0}),
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Normalizes the text in the input strings column
*
* @see nvtext::character_normalizer for details on the normalizer behavior
*
* @code{.pseudo}
* cn = create_character_normalizer(true)
* s = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"]
* s1 = normalize_characters(s,cn)
* s1 is now ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "]
*
* cn = create_character_normalizer(false)
* s2 = normalize_characters(s,cn)
* s2 is now ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "]
* @endcode
*
* A null input element at row `i` produces a corresponding null entry
* for row `i` in the output column.
*
* @param input The input strings to normalize
* @param normalizer Normalizer object created by nvtext::create_character_normalizer
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return Normalized strings column
*/
std::unique_ptr<cudf::column> normalize_characters(
cudf::strings_column_view const& input,
character_normalizer const& normalizer,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/** @} */ // end of group
} // namespace CUDF_EXPORT nvtext
14 changes: 13 additions & 1 deletion cpp/src/strings/utilities.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -180,6 +180,18 @@ int64_t get_offset_value(cudf::column_view const& offsets,
: cudf::detail::get_value<int32_t>(offsets, index, stream);
}

// Returns the {first, last} offset values of `input`, honoring any slice
// offset on the column. An empty column yields {0, 0}.
std::pair<int64_t, int64_t> get_first_and_last_offset(cudf::strings_column_view const& input,
                                                      rmm::cuda_stream_view stream)
{
  // Nothing to look up for an empty column.
  if (input.is_empty()) { return {0L, 0L}; }

  auto const begin_index = input.offset();
  auto const end_index   = input.size() + input.offset();

  // An unsliced column starts at offset 0, so the lookup is skipped in that case.
  int64_t first_offset = 0;
  if (begin_index != 0) {
    first_offset =
      cudf::strings::detail::get_offset_value(input.offsets(), begin_index, stream);
  }

  int64_t const last_offset =
    cudf::strings::detail::get_offset_value(input.offsets(), end_index, stream);

  return {first_offset, last_offset};
}

} // namespace detail

rmm::device_uvector<string_view> create_string_vector_from_column(
Expand Down
Loading

0 comments on commit 18a5412

Please sign in to comment.