From 8c09329219720bdd11a0fe958c83c33d5df818ba Mon Sep 17 00:00:00 2001
From: Philip May
Date: Sat, 9 Dec 2023 23:08:08 +0100
Subject: [PATCH] Add text clean tool. (#105)

* add remove_invisible_chars
* add sphinx doc for text module
* improve docstring
* add replace_special_whitespaces
* extend SPECIAL_WHITESPACES
* extend SPECIAL_WHITESPACES
* fix type
* add tests
---
 docs/source/api-reference/text.rst |  6 +++
 mltb2/text.py                      | 65 ++++++++++++++++++++++++++++++
 tests/test_text.py                 | 50 +++++++++++++++++++++++
 3 files changed, 121 insertions(+)
 create mode 100644 docs/source/api-reference/text.rst
 create mode 100644 mltb2/text.py
 create mode 100644 tests/test_text.py

diff --git a/docs/source/api-reference/text.rst b/docs/source/api-reference/text.rst
new file mode 100644
index 0000000..4ee4981
--- /dev/null
+++ b/docs/source/api-reference/text.rst
@@ -0,0 +1,6 @@
+.. _text_code_doc:
+
+:mod:`~mltb2.text`
+==================
+
+.. automodule:: mltb2.text
diff --git a/mltb2/text.py b/mltb2/text.py
new file mode 100644
index 0000000..90321b9
--- /dev/null
+++ b/mltb2/text.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2023 Philip May
+# This software is distributed under the terms of the MIT license
+# which is available at https://opensource.org/licenses/MIT
+
+"""Text cleaning helpers: remove invisible characters and replace special whitespaces."""
+
+from typing import Dict, Final, Tuple
+
+INVISIBLE_CHARACTERS: Final[Tuple[str, ...]] = (
+    "\u200b",  # Zero Width Space (ZWSP) https://www.compart.com/en/unicode/U+200b
+    "\u00ad",  # Soft Hyphen (SHY) https://www.compart.com/en/unicode/U+00ad
+    # TODO: decide whether these should be listed here as well:
+    # https://www.compart.com/en/unicode/U+2028 (Line Separator)
+    # https://www.compart.com/en/unicode/U+2029 (Paragraph Separator)
+)
+
+INVISIBLE_CHARACTERS_TRANS: Final[Dict[int, None]] = str.maketrans({char: None for char in INVISIBLE_CHARACTERS})
+
+SPECIAL_WHITESPACES: Final[Tuple[str, ...]] = (
+    # unicode block "General Punctuation": https://www.compart.com/en/unicode/block/U+2000
+    "\u2000",  # En Quad
+    "\u2001",  # Em Quad
+    "\u2002",  # En Space
+    "\u2003",  # Em Space
+    "\u2004",  # Three-Per-Em Space
+    "\u2005",  # Four-Per-Em Space
+    "\u2006",  # Six-Per-Em Space
+    "\u2007",  # Figure Space https://www.compart.com/en/unicode/U+2007
+    "\u2008",  # Punctuation Space
+    "\u2009",  # Thin Space https://www.compart.com/en/unicode/U+2009
+    "\u200a",  # Hair Space https://www.compart.com/en/unicode/U+200A
+    "\u202f",  # Narrow No-Break Space (NNBSP) https://www.compart.com/en/unicode/U+202f
+    # other unicode blocks
+    "\u00a0",  # No-Break Space (NBSP) https://www.compart.com/en/unicode/U+00a0
+)
+
+SPECIAL_WHITESPACES_TRANS: Final[Dict[int, str]] = str.maketrans({char: " " for char in SPECIAL_WHITESPACES})
+
+
+def remove_invisible_characters(text: str) -> str:
+    """Remove invisible characters from text.
+
+    Every character listed in the constant `INVISIBLE_CHARACTERS` is deleted.
+
+    Args:
+        text: The text from which the invisible characters are to be removed.
+
+    Returns:
+        The text without the invisible characters.
+    """
+    return text.translate(INVISIBLE_CHARACTERS_TRANS)
+
+
+def replace_special_whitespaces(text: str) -> str:
+    """Replace special whitespaces with normal whitespaces.
+
+    Every character listed in the constant `SPECIAL_WHITESPACES` is replaced by one normal space.
+
+    Args:
+        text: The text in which the special whitespaces are to be replaced.
+
+    Returns:
+        The text with each special whitespace replaced by a normal space.
+    """
+    return text.translate(SPECIAL_WHITESPACES_TRANS)
diff --git a/tests/test_text.py b/tests/test_text.py
new file mode 100644
index 0000000..476a5cc
--- /dev/null
+++ b/tests/test_text.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2023 Philip May
+# This software is distributed under the terms of the MIT license
+# which is available at https://opensource.org/licenses/MIT
+
+import pytest
+
+from mltb2.text import (
+    INVISIBLE_CHARACTERS,
+    SPECIAL_WHITESPACES,
+    remove_invisible_characters,
+    replace_special_whitespaces,
+)
+
+
+def test_remove_invisible_characters():
+    text = "Hello\u200bWorld\u00ad!"  # contains a zero width space and a soft hyphen
+    result = remove_invisible_characters(text)
+    assert result == "HelloWorld!"
+
+
+def test_remove_invisible_characters_empty():
+    text = ""
+    result = remove_invisible_characters(text)
+    assert result == ""
+
+
+@pytest.mark.parametrize("char", INVISIBLE_CHARACTERS)
+def test_remove_invisible_characters_single_char(char: str):
+    text = f">{char}<"
+    result = remove_invisible_characters(text)
+    assert result == "><"
+
+
+def test_replace_special_whitespaces():
+    text = "a\u00a0b\u2009c\u202fd\u2007e\u200af"  # NBSP, thin, narrow no-break, figure and hair space
+    result = replace_special_whitespaces(text)
+    assert result == "a b c d e f"
+
+
+def test_replace_special_whitespaces_empty():
+    text = ""
+    result = replace_special_whitespaces(text)
+    assert result == ""
+
+
+@pytest.mark.parametrize("char", SPECIAL_WHITESPACES)
+def test_replace_special_whitespaces_single_char(char: str):
+    text = f">{char}<"
+    result = replace_special_whitespaces(text)
+    assert result == "> <"