[SPARKNLP-1109] Adding Cleaner annotator
Showing 9 changed files with 1,553 additions and 55 deletions.
@@ -0,0 +1,201 @@
# Copyright 2017-2025 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for Cleaner."""
from sparknlp.common import *


class Cleaner(AnnotatorModel):
    name = "Cleaner"

    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.CHUNK

    encoding = Param(Params._dummy(),
                     "encoding",
                     "The encoding to be used for decoding the byte string (default is utf-8)",
                     typeConverter=TypeConverters.toString)

    cleanPrefixPattern = Param(Params._dummy(),
                               "cleanPrefixPattern",
                               "The pattern for the prefix. Can be a simple string or a regex pattern.",
                               typeConverter=TypeConverters.toString)

    cleanPostfixPattern = Param(Params._dummy(),
                                "cleanPostfixPattern",
                                "The pattern for the postfix. Can be a simple string or a regex pattern.",
                                typeConverter=TypeConverters.toString)

    cleanerMode = Param(
        Params._dummy(),
        "cleanerMode",
        "possible values: " +
        "clean, bytes_string_to_string, clean_non_ascii_chars, clean_ordered_bullets, clean_postfix, clean_prefix, remove_punctuation, replace_unicode_quotes",
        typeConverter=TypeConverters.toString
    )

    extraWhitespace = Param(Params._dummy(),
                            "extraWhitespace",
                            "Whether to remove extra whitespace.",
                            typeConverter=TypeConverters.toBoolean)

    dashes = Param(Params._dummy(),
                   "dashes",
                   "Whether to handle dashes in text.",
                   typeConverter=TypeConverters.toBoolean)

    bullets = Param(Params._dummy(),
                    "bullets",
                    "Whether to handle bullets in text.",
                    typeConverter=TypeConverters.toBoolean)

    trailingPunctuation = Param(Params._dummy(),
                                "trailingPunctuation",
                                "Whether to remove trailing punctuation from text.",
                                typeConverter=TypeConverters.toBoolean)

    lowercase = Param(Params._dummy(),
                      "lowercase",
                      "Whether to convert text to lowercase.",
                      typeConverter=TypeConverters.toBoolean)

    ignoreCase = Param(Params._dummy(),
                       "ignoreCase",
                       "If true, ignores case in the pattern.",
                       typeConverter=TypeConverters.toBoolean)

    strip = Param(Params._dummy(),
                  "strip",
                  "If true, removes leading or trailing whitespace from the cleaned string.",
                  typeConverter=TypeConverters.toBoolean)

    def setEncoding(self, value):
        """Sets the encoding to be used for decoding the byte string (default is utf-8).

        Parameters
        ----------
        value : str
            The encoding to be used for decoding the byte string (default is utf-8)
        """
        return self._set(encoding=value)

    def setCleanPrefixPattern(self, value):
        """Sets the pattern for the prefix. Can be a simple string or a regex pattern.

        Parameters
        ----------
        value : str
            The pattern for the prefix. Can be a simple string or a regex pattern.
        """
        return self._set(cleanPrefixPattern=value)

    def setCleanPostfixPattern(self, value):
        """Sets the pattern for the postfix. Can be a simple string or a regex pattern.

        Parameters
        ----------
        value : str
            The pattern for the postfix. Can be a simple string or a regex pattern.
        """
        return self._set(cleanPostfixPattern=value)

    def setCleanerMode(self, value):
        """Sets the cleaner mode.

        Possible values:
        clean, bytes_string_to_string, clean_non_ascii_chars, clean_ordered_bullets,
        clean_postfix, clean_prefix, remove_punctuation, replace_unicode_quotes

        Parameters
        ----------
        value : str
            The mode for cleaning operations.
        """
        return self._set(cleanerMode=value)

    def setExtraWhitespace(self, value):
        """Sets whether to remove extra whitespace.

        Parameters
        ----------
        value : bool
            Whether to remove extra whitespace.
        """
        return self._set(extraWhitespace=value)

    def setDashes(self, value):
        """Sets whether to handle dashes in text.

        Parameters
        ----------
        value : bool
            Whether to handle dashes in text.
        """
        return self._set(dashes=value)

    def setBullets(self, value):
        """Sets whether to handle bullets in text.

        Parameters
        ----------
        value : bool
            Whether to handle bullets in text.
        """
        return self._set(bullets=value)

    def setTrailingPunctuation(self, value):
        """Sets whether to remove trailing punctuation from text.

        Parameters
        ----------
        value : bool
            Whether to remove trailing punctuation from text.
        """
        return self._set(trailingPunctuation=value)

    def setLowercase(self, value):
        """Sets whether to convert text to lowercase.

        Parameters
        ----------
        value : bool
            Whether to convert text to lowercase.
        """
        return self._set(lowercase=value)

    def setIgnoreCase(self, value):
        """Sets whether to ignore case in the pattern.

        Parameters
        ----------
        value : bool
            If true, ignores case in the pattern.
        """
        return self._set(ignoreCase=value)

    def setStrip(self, value):
        """Sets whether to remove leading or trailing whitespace from the cleaned string.

        Parameters
        ----------
        value : bool
            If true, removes leading or trailing whitespace from the cleaned string.
        """
        return self._set(strip=value)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.cleaners.Cleaner", java_model=None):
        super(Cleaner, self).__init__(
            classname=classname,
            java_model=java_model
        )
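Below is a minimal usage sketch for the new annotator, assuming a running Spark session with Spark NLP loaded. The "clean" mode and the bullets/extraWhitespace/dashes parameters are taken from the class above; the column wiring mirrors the test further down, and the DataFrame in the final comment is hypothetical.

from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler
from sparknlp.annotator.cleaners import Cleaner

document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

# Hypothetical configuration exercising the boolean params declared above.
cleaner = Cleaner() \
    .setInputCols(["document"]) \
    .setOutputCol("cleaned") \
    .setCleanerMode("clean") \
    .setBullets(True) \
    .setExtraWhitespace(True) \
    .setDashes(True)

pipeline = Pipeline().setStages([document_assembler, cleaner])
# cleaned_df = pipeline.fit(df).transform(df)  # df: any DataFrame with a "text" column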
@@ -0,0 +1,47 @@
# Copyright 2017-2025 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

import pytest

from sparknlp.annotator.cleaners import *
from sparknlp.base import *
from test.util import SparkContextForTest


@pytest.mark.slow
class CleanerTestSpec(unittest.TestCase):

    def setUp(self):
        self.spark = SparkContextForTest.spark
        eml_data = """Hello ð\x9f\x98\x80"""
        self.data_set = self.spark.createDataFrame([[eml_data]]).toDF("text")

    def runTest(self):
        document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")

        extractor = Cleaner() \
            .setInputCols(["document"]) \
            .setOutputCol("cleaned") \
            .setCleanerMode("bytes_string_to_string")

        pipeline = Pipeline().setStages([
            document_assembler,
            extractor
        ])

        model = pipeline.fit(self.data_set)
        result = model.transform(self.data_set)
        result.show(truncate=False)
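For reference, a plain-Python sketch of the decode round trip the test input implies: "Hello ð\x9f\x98\x80" is the UTF-8 byte sequence for "Hello 😀" mis-decoded as Latin-1, so re-encoding and decoding recovers the emoji. This only illustrates what the bytes_string_to_string mode is presumably expected to produce for this input; it is not the annotator's implementation.

# Hypothetical illustration of the expected cleanup, independent of Spark NLP.
garbled = "Hello ð\x9f\x98\x80"   # UTF-8 bytes for "Hello 😀" read as Latin-1
restored = garbled.encode("latin-1").decode("utf-8")
print(restored)                    # Hello 😀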