
Commit

[SPARKNLP-1109] Adding Cleaner annotator
danilojsl committed Feb 6, 2025
1 parent bbd5d05 commit 39de6c6
Showing 9 changed files with 1,553 additions and 55 deletions.
3 changes: 2 additions & 1 deletion python/sparknlp/annotator/cleaners/__init__.py
@@ -11,4 +11,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from sparknlp.annotator.cleaners.extractor import *
+from sparknlp.annotator.cleaners.extractor import *
+from sparknlp.annotator.cleaners.cleaner import *
201 changes: 201 additions & 0 deletions python/sparknlp/annotator/cleaners/cleaner.py
@@ -0,0 +1,201 @@
# Copyright 2017-2025 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for Cleaner."""
from sparknlp.common import *

class Cleaner(AnnotatorModel):
name = "Cleaner"

inputAnnotatorTypes = [AnnotatorType.TOKEN]

outputAnnotatorType = AnnotatorType.CHUNK

encoding = Param(Params._dummy(),
"encoding",
"The encoding to be used for decoding the byte string (default is utf-8)",
typeConverter=TypeConverters.toString)

cleanPrefixPattern = Param(Params._dummy(),
"cleanPrefixPattern",
"The pattern for the prefix. Can be a simple string or a regex pattern.",
typeConverter=TypeConverters.toString)

cleanPostfixPattern = Param(Params._dummy(),
"cleanPostfixPattern",
"The pattern for the postfix. Can be a simple string or a regex pattern.",
typeConverter=TypeConverters.toString)

cleanerMode = Param(
Params._dummy(),
"cleanerMode",
"possible values: " +
"clean, bytes_string_to_string, clean_non_ascii_chars, clean_ordered_bullets, clean_postfix, clean_prefix, remove_punctuation, replace_unicode_quotes",
typeConverter=TypeConverters.toString
)

extraWhitespace = Param(Params._dummy(),
"extraWhitespace",
"Whether to remove extra whitespace.",
typeConverter=TypeConverters.toBoolean)

dashes = Param(Params._dummy(),
"dashes",
"Whether to handle dashes in text.",
typeConverter=TypeConverters.toBoolean)

bullets = Param(Params._dummy(),
"bullets",
"Whether to handle bullets in text.",
typeConverter=TypeConverters.toBoolean)

trailingPunctuation = Param(Params._dummy(),
"trailingPunctuation",
"Whether to remove trailing punctuation from text.",
typeConverter=TypeConverters.toBoolean)

lowercase = Param(Params._dummy(),
"lowercase",
"Whether to convert text to lowercase.",
typeConverter=TypeConverters.toBoolean)

ignoreCase = Param(Params._dummy(),
"ignoreCase",
"If true, ignores case in the pattern.",
typeConverter=TypeConverters.toBoolean)

strip = Param(Params._dummy(),
"strip",
"If true, removes leading or trailing whitespace from the cleaned string.",
typeConverter=TypeConverters.toBoolean)

def setEncoding(self, value):
"""Sets the encoding to be used for decoding the byte string (default is utf-8).
Parameters
----------
value : str
The encoding to be used for decoding the byte string (default is utf-8)
"""
return self._set(encoding=value)

def setCleanPrefixPattern(self, value):
"""Sets the pattern for the prefix. Can be a simple string or a regex pattern.
Parameters
----------
value : str
The pattern for the prefix. Can be a simple string or a regex pattern.
"""
return self._set(cleanPrefixPattern=value)

def setCleanPostfixPattern(self, value):
"""Sets the pattern for the postfix. Can be a simple string or a regex pattern.
Parameters
----------
value : str
The pattern for the postfix. Can be a simple string or a regex pattern.
"""
return self._set(cleanPostfixPattern=value)

def setCleanerMode(self, value):
"""Sets the cleaner mode.
Possible values:
clean, bytes_string_to_string, clean_non_ascii_chars, clean_ordered_bullets,
clean_postfix, clean_prefix, remove_punctuation, replace_unicode_quotes
Parameters
----------
value : str
The mode for cleaning operations.
"""
return self._set(cleanerMode=value)

def setExtraWhitespace(self, value):
"""Sets whether to remove extra whitespace.
Parameters
----------
value : bool
Whether to remove extra whitespace.
"""
return self._set(extraWhitespace=value)

def setDashes(self, value):
"""Sets whether to handle dashes in text.
Parameters
----------
value : bool
Whether to handle dashes in text.
"""
return self._set(dashes=value)

def setBullets(self, value):
"""Sets whether to handle bullets in text.
Parameters
----------
value : bool
Whether to handle bullets in text.
"""
return self._set(bullets=value)

def setTrailingPunctuation(self, value):
"""Sets whether to remove trailing punctuation from text.
Parameters
----------
value : bool
Whether to remove trailing punctuation from text.
"""
return self._set(trailingPunctuation=value)

def setLowercase(self, value):
"""Sets whether to convert text to lowercase.
Parameters
----------
value : bool
Whether to convert text to lowercase.
"""
return self._set(lowercase=value)

def setIgnoreCase(self, value):
"""Sets whether to ignore case in the pattern.
Parameters
----------
value : bool
If true, ignores case in the pattern.
"""
return self._set(ignoreCase=value)

def setStrip(self, value):
"""Sets whether to remove leading or trailing whitespace from the cleaned string.
Parameters
----------
value : bool
If true, removes leading or trailing whitespace from the cleaned string.
"""
return self._set(strip=value)

@keyword_only
def __init__(self, classname="com.johnsnowlabs.nlp.annotators.cleaners.Cleaner", java_model=None):
super(Cleaner, self).__init__(
classname=classname,
java_model=java_model
)
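
A usage sketch (not part of this commit): wiring the new Cleaner into a pipeline in "clean" mode. It assumes a standard sparknlp.start() session, mirrors the commit's own test in feeding the DocumentAssembler output to the annotator, and assumes the boolean flags below are honored by this mode; the sample text and column names are made up.

import sparknlp
from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler
from sparknlp.annotator.cleaners import Cleaner

spark = sparknlp.start()

# Messy sample input: leading bullet, doubled spaces, a dash run, trailing whitespace
data = spark.createDataFrame([["•  ITEM 1:  Some   text -- with   extra whitespace.  "]]).toDF("text")

document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

# "clean" mode with the boolean options exposed by the annotator
# (assumed to apply in this mode)
cleaner = Cleaner() \
    .setInputCols(["document"]) \
    .setOutputCol("cleaned") \
    .setCleanerMode("clean") \
    .setBullets(True) \
    .setDashes(True) \
    .setExtraWhitespace(True) \
    .setTrailingPunctuation(True) \
    .setLowercase(True)

pipeline = Pipeline().setStages([document_assembler, cleaner])
pipeline.fit(data).transform(data).select("cleaned.result").show(truncate=False)

Since outputAnnotatorType is CHUNK, the cleaned strings land under the annotation's result field, which is what the final select assumes. For the clean_prefix and clean_postfix modes, setCleanPrefixPattern or setCleanPostfixPattern together with ignoreCase and strip would presumably be used instead of the boolean flags above.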
47 changes: 47 additions & 0 deletions python/test/annotator/cleaners/cleaner_test.py
@@ -0,0 +1,47 @@
# Copyright 2017-2025 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

import pytest

from sparknlp.annotator.cleaners import *
from sparknlp.base import *
from test.util import SparkContextForTest


@pytest.mark.slow
class CleanerTestSpec(unittest.TestCase):

def setUp(self):
self.spark = SparkContextForTest.spark
eml_data = """Hello ð\x9f\x98\x80"""
self.data_set = self.spark.createDataFrame([[eml_data]]).toDF("text")

def runTest(self):
document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")

extractor = Cleaner() \
.setInputCols(["document"]) \
.setOutputCol("cleaned") \
.setCleanerMode("bytes_string_to_string")

pipeline = Pipeline().setStages([
document_assembler,
extractor
])

model = pipeline.fit(self.data_set)
result = model.transform(self.data_set)
result.show(truncate=False)
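
The sample string in this test is the UTF-8 byte sequence of the 😀 emoji mis-decoded as Latin-1, so the bytes_string_to_string mode (with its default utf-8 encoding parameter) is presumably meant to reverse exactly that kind of mojibake. A plain-Python illustration of the assumed round trip, independent of Spark NLP:

s = "Hello ð\x9f\x98\x80"                    # UTF-8 bytes of U+1F600 misread as Latin-1
fixed = s.encode("latin-1").decode("utf-8")  # re-encode to raw bytes, then decode as UTF-8
print(fixed)                                 # Hello 😀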

(6 additional changed files not shown.)
