From a1177e597de75f31507895540d2e6243dccafe27 Mon Sep 17 00:00:00 2001 From: Lasse Date: Thu, 19 Jan 2023 12:08:31 +0100 Subject: [PATCH 01/10] feat: add out of vocabulary ratio to quality component --- src/textdescriptives/components/quality.py | 44 ++++++++++++++++++- .../components/quality_data_classes.py | 14 +++++- tests/test_quality.py | 12 +++++ 3 files changed, 68 insertions(+), 2 deletions(-) diff --git a/src/textdescriptives/components/quality.py b/src/textdescriptives/components/quality.py index e4860d9d..0f43e5f9 100644 --- a/src/textdescriptives/components/quality.py +++ b/src/textdescriptives/components/quality.py @@ -1,6 +1,7 @@ """Component for calculating quality metrics.""" from collections import Counter, defaultdict -from typing import Callable, Dict, List, Optional, Tuple, Union +from functools import partial +from typing import Callable, Dict, List, Mapping, Optional, Tuple, Union import numpy as np from spacy.language import Language @@ -334,6 +335,23 @@ def contains_string(span: Union[Span, Doc], string: str) -> bool: return string in span.text +def oov_ratio(span: Union[Span, Doc], vocab: Optional[Mapping] = None) -> float: + """Calculates the out-of-vocabulary ratio. + + Args: + span (Union[Span, Doc]): A spaCy Span or Doc object. + vocab (Optional[Mapping], optional): A vocabulary to check against. + If None, will use the spaCy vocab. Note that the spaCy vocab + is not defined for small models. Defaults to None. + + Returns: + float: the out-of-vocabulary ratio + """ + if vocab is None: + return len([token for token in span if token.is_oov]) / len(span) + return len([token for token in span if token.text not in vocab]) / len(span) + + class Quality: """spaCy component for adding text quality metrics to the `Doc` and `Span` objects. 
@@ -351,6 +369,7 @@ def __init__( # pylint: disable=dangerous-default-value top_ngram_range: Tuple[int, int], top_ngram_min_count: int, duplicate_n_gram_fraction_range: Tuple[int, int], + vocab: Optional[Mapping], quality_thresholds: Optional[QualityThresholds] = None, force: bool = False, ): # noqa: D107 @@ -365,6 +384,7 @@ def __init__( # pylint: disable=dangerous-default-value if quality_thresholds is None: quality_thresholds = QualityThresholds() self.quality_thresholds = quality_thresholds + self.vocab = vocab self.set_extensions() @@ -453,6 +473,20 @@ def quality_setter( for n_gram, frac in duplicate_ngram_chr_fraction.items() } + # add oov_ratio if spacy model is not small or has a vocab + # vector length is 0 for small models + if span.vocab.vectors_length > 0 or self.vocab: + value_oov = oov_ratio(span, self.vocab) + thresholds_oov = threshold.oov_ratio + else: + value_oov = None + thresholds_oov = (None, None) + + thresholds_outputs["oov_ratio"] = ThresholdsOutput( + value=value_oov, + threshold=thresholds_oov, + ) + return QualityOutput(**thresholds_outputs) def quality_getter(self, span: Union[Span, Doc]) -> QualityOutput: @@ -545,6 +579,7 @@ def __call__(self, doc: Doc): "top_ngram_range": [2, 4], "top_ngram_min_count": 3, "duplicate_n_gram_fraction_range": [5, 10], + "vocab": None, "force": True, }, ) @@ -556,6 +591,7 @@ def create_quality_component( top_ngram_range: Tuple[int, int], top_ngram_min_count: int, duplicate_n_gram_fraction_range: Tuple[int, int], + vocab: Optional[Mapping], force: bool = True, ) -> Callable[[Doc], Doc]: """Allows Quality to be added to a spaCy pipe using @@ -600,6 +636,11 @@ def create_quality_component( be considered a top n-gram. Defaults to 3. duplicate_n_gram_fraction_range (Tuple[int]): range of n-grams to calculate the proportion of duplicate n-grams. Defaults to [5, 10]. + vocab (Optional[Mapping]): vocabulary to use for calculating the + out-of-vocabulary rate. If none, will use the vocabulary of the + spaCy model. 
Note, that small spaCy models do not have a vocabulary. + The attribute will only be set if the vocabulary is not None or + the spaCy model is medium or large. force (bool): whether to overwrite existing extensions. Defaults to True. @@ -626,5 +667,6 @@ def create_quality_component( top_ngram_min_count=top_ngram_min_count, duplicate_n_gram_fraction_range=duplicate_n_gram_fraction_range, quality_thresholds=None, + vocab=vocab, force=force, ) diff --git a/src/textdescriptives/components/quality_data_classes.py b/src/textdescriptives/components/quality_data_classes.py index 840d203f..af33a12f 100644 --- a/src/textdescriptives/components/quality_data_classes.py +++ b/src/textdescriptives/components/quality_data_classes.py @@ -24,11 +24,13 @@ class Config: extra = Extra.forbid threshold: Union[Interval, bool, None] - value: float + value: Union[float, None] @property def passed(self) -> bool: """Return True if the value is within the thresholds.""" + if self.value is None: + return True if self.threshold is None: return True if isinstance(self.threshold, bool): @@ -151,6 +153,11 @@ class Config: + r"are contained within a duplicate for 2-grams, 18% for 3-grams and 16% " + "for 4-grams.", ) + oov_ratio: Interval = Field( + (None, 0.2), + description="A range for the out-of-vocabulary ratio. Default: (None, 0.2)" + + r" i.e. 
no lower limit, but at most 20% of words are out-of-vocabulary.", + ) class QualityOutput(BaseModel): @@ -211,6 +218,10 @@ class Config: ..., description="The thresholds output for the top n-gram character fraction.", ) + oov_ratio: ThresholdsOutput = Field( + ..., + description="The thresholds output for the out-of-vocabulary ratio.", + ) @property def passed(self) -> bool: @@ -232,6 +243,7 @@ def passed(self) -> bool: self.duplicate_paragraph_chr_fraction.passed, all(v.passed for v in self.duplicate_ngram_chr_fraction.values()), all(v.passed for v in self.top_ngram_chr_fraction.values()), + self.oov_ratio.passed, ], ) diff --git a/tests/test_quality.py b/tests/test_quality.py index 4778a4e8..3f88589d 100644 --- a/tests/test_quality.py +++ b/tests/test_quality.py @@ -4,12 +4,14 @@ import pytest import spacy + import textdescriptives as td from textdescriptives.components.quality import ( alpha_ratio, duplicate_ngram_fraction, mean_word_length, n_stop_words, + oov_ratio, proportion_bullet_points, proportion_ellipsis, symbol_to_word_ratio, @@ -190,6 +192,7 @@ def test_quality_component(nlp: spacy.Language): assert quality.duplicate_ngram_chr_fraction["5"] == 1 assert abs(quality.top_ngram_chr_fraction["2"].value - 0.44) < 0.01 assert doc._.passed_quality_check is False + assert quality.oov_ratio.value is None assert quality.passed is False @@ -266,3 +269,12 @@ def test_quality_multi_process(nlp): docs = nlp.pipe(texts, n_process=2) for doc in docs: assert doc._.quality + +@pytest.mark.parametrize("vocab", [None, {"This", "is", "a", "test"}]) +def test_oov_ratio(vocab): + """Test the oov_ratio function.""" + nlp = spacy.load("en_core_web_md") + doc = nlp("This is a test") + assert oov_ratio(doc, vocab) == 0 + doc = nlp("This is a nonwrod") + assert oov_ratio(doc, vocab) == 0.25 From c3dea6c213890a17967a164286b76758a5956fdf Mon Sep 17 00:00:00 2001 From: Lasse Date: Mon, 23 Jan 2023 16:39:54 +0100 Subject: [PATCH 02/10] chore: fixes after review --- 
src/textdescriptives/components/quality.py | 2 +- .../components/quality_data_classes.py | 8 +++++--- src/textdescriptives/utils.py | 12 ++++++++++++ tests/test_quality.py | 9 +++++++++ 4 files changed, 27 insertions(+), 4 deletions(-) diff --git a/src/textdescriptives/components/quality.py b/src/textdescriptives/components/quality.py index 0f43e5f9..df979fe8 100644 --- a/src/textdescriptives/components/quality.py +++ b/src/textdescriptives/components/quality.py @@ -637,7 +637,7 @@ def create_quality_component( duplicate_n_gram_fraction_range (Tuple[int]): range of n-grams to calculate the proportion of duplicate n-grams. Defaults to [5, 10]. vocab (Optional[Mapping]): vocabulary to use for calculating the - out-of-vocabulary rate. If none, will use the vocabulary of the + out-of-vocabulary ratio (`oov_ratio`). If None, will use the vocabulary of the spaCy model. Note, that small spaCy models do not have a vocabulary. The attribute will only be set if the vocabulary is not None or the spaCy model is medium or large. diff --git a/src/textdescriptives/components/quality_data_classes.py b/src/textdescriptives/components/quality_data_classes.py index af33a12f..03e96205 100644 --- a/src/textdescriptives/components/quality_data_classes.py +++ b/src/textdescriptives/components/quality_data_classes.py @@ -3,6 +3,8 @@ from pydantic import BaseModel, Extra, Field +from textdescriptives.utils import all_true_or_none + Interval = Tuple[Optional[float], Optional[float]] @@ -27,10 +29,10 @@ class Config: value: Union[float, None] @property - def passed(self) -> bool: + def passed(self) -> Optional[bool]: """Return True if the value is within the thresholds.""" if self.value is None: - return True + return None if self.threshold is None: return True if isinstance(self.threshold, bool): @@ -229,7 +231,7 @@ def passed(self) -> bool: Returns: bool: Whether all thresholds have been passed. 
""" - return all( + return all_true_or_none( [ self.n_stop_words.passed, self.alpha_ratio.passed, diff --git a/src/textdescriptives/utils.py b/src/textdescriptives/utils.py index 741214fd..75968962 100644 --- a/src/textdescriptives/utils.py +++ b/src/textdescriptives/utils.py @@ -169,3 +169,15 @@ def _create_spacy_pipeline( msg.info(f"No spacy model provided. Inferring spacy model for {lang}.") spacy_model = _download_spacy_model(lang=lang, size=spacy_model_size) return spacy.load(spacy_model) + + +def all_true_or_none(x: Iterable[bool]) -> bool: + """Check if all elements in an iterable are True or None. + + Args: + x (Iterable[bool]): Iterable to check + + Returns: + bool: True if all elements are True or None, False otherwise. + """ + return all([i is None or i for i in x]) \ No newline at end of file diff --git a/tests/test_quality.py b/tests/test_quality.py index 3f88589d..814cb1cf 100644 --- a/tests/test_quality.py +++ b/tests/test_quality.py @@ -212,6 +212,7 @@ def test_quality_component_with_config(nlp: spacy.Language): top_ngram_chr_fraction={2: (None, 0.6), 3: (None, 0.6)}, duplicate_ngram_chr_fraction={}, contains={"lorem ipsum": False}, + oov_ratio=(None, 0.3), ) quality_pipe = nlp.add_pipe( @@ -234,6 +235,7 @@ def test_quality_component_with_config(nlp: spacy.Language): assert doc._.quality.duplicate_ngram_chr_fraction["8"] == 1 assert abs(doc._.quality.top_ngram_chr_fraction["3"].value - 0.57) < 0.01 assert doc._.passed_quality_check is True + assert doc._.quality.oov_ratio.value is None @pytest.mark.parametrize( @@ -278,3 +280,10 @@ def test_oov_ratio(vocab): assert oov_ratio(doc, vocab) == 0 doc = nlp("This is a nonwrod") assert oov_ratio(doc, vocab) == 0.25 + + +def test_oov_ratio_small_model(): + nlp = spacy.load("en_core_web_sm") + nlp.add_pipe("textdescriptives/quality") + doc = nlp("This is a test") + assert doc._.quality.oov_ratio.value is None \ No newline at end of file From a87a5eacfa1895caa5de5c9c775b5012df1f1267 Mon Sep 17 00:00:00 2001 
From: Lasse Date: Mon, 23 Jan 2023 16:43:27 +0100 Subject: [PATCH 03/10] chore: precommit --- src/textdescriptives/components/quality.py | 9 ++++----- src/textdescriptives/utils.py | 4 ++-- tests/test_quality.py | 3 ++- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/textdescriptives/components/quality.py b/src/textdescriptives/components/quality.py index df979fe8..c31b2dd2 100644 --- a/src/textdescriptives/components/quality.py +++ b/src/textdescriptives/components/quality.py @@ -1,6 +1,5 @@ """Component for calculating quality metrics.""" from collections import Counter, defaultdict -from functools import partial from typing import Callable, Dict, List, Mapping, Optional, Tuple, Union import numpy as np @@ -637,10 +636,10 @@ def create_quality_component( duplicate_n_gram_fraction_range (Tuple[int]): range of n-grams to calculate the proportion of duplicate n-grams. Defaults to [5, 10]. vocab (Optional[Mapping]): vocabulary to use for calculating the - out-of-vocabulary ratio (`oov_ratio`). If None, will use the vocabulary of the - spaCy model. Note, that small spaCy models do not have a vocabulary. - The attribute will only be set if the vocabulary is not None or - the spaCy model is medium or large. + out-of-vocabulary ratio (`oov_ratio`). If None, will use the vocabulary + of the spaCy model. Note, that small spaCy models do not have a + vocabulary. The attribute will only be set if the vocabulary is not + None or the spaCy model is medium or large. force (bool): whether to overwrite existing extensions. Defaults to True. diff --git a/src/textdescriptives/utils.py b/src/textdescriptives/utils.py index 75968962..28044ab1 100644 --- a/src/textdescriptives/utils.py +++ b/src/textdescriptives/utils.py @@ -171,7 +171,7 @@ def _create_spacy_pipeline( return spacy.load(spacy_model) -def all_true_or_none(x: Iterable[bool]) -> bool: +def all_true_or_none(x: Iterable[Optional[bool]]) -> bool: """Check if all elements in an iterable are True or None. 
Args: @@ -180,4 +180,4 @@ def all_true_or_none(x: Iterable[bool]) -> bool: Returns: bool: True if all elements are True or None, False otherwise. """ - return all([i is None or i for i in x]) \ No newline at end of file + return all([i is None or i for i in x]) diff --git a/tests/test_quality.py b/tests/test_quality.py index 814cb1cf..35d08530 100644 --- a/tests/test_quality.py +++ b/tests/test_quality.py @@ -272,6 +272,7 @@ def test_quality_multi_process(nlp): for doc in docs: assert doc._.quality + @pytest.mark.parametrize("vocab", [None, {"This", "is", "a", "test"}]) def test_oov_ratio(vocab): """Test the oov_ratio function.""" @@ -286,4 +287,4 @@ def test_oov_ratio_small_model(): nlp = spacy.load("en_core_web_sm") nlp.add_pipe("textdescriptives/quality") doc = nlp("This is a test") - assert doc._.quality.oov_ratio.value is None \ No newline at end of file + assert doc._.quality.oov_ratio.value is None From 8eb070e3887a69eb20198af464ba0b52609be751 Mon Sep 17 00:00:00 2001 From: Lasse Date: Mon, 23 Jan 2023 16:57:18 +0100 Subject: [PATCH 04/10] tests: add spacy medium to test requirements --- tests/requirements.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/requirements.txt b/tests/requirements.txt index 32d74dea..6842e71a 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,5 +1,6 @@ # only for requirements which can't be specified in the pyproject.toml file # e.g. 
links to wheels which is not allowed in pyproject.toml on pypi -# spacy pipeline -https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz \ No newline at end of file +# spacy pipelines +https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz +https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_md-3.2.0.tar.gz \ No newline at end of file From ea05cf3d9e511bd1e1404be568557a32f6d83062 Mon Sep 17 00:00:00 2001 From: Lasse Date: Mon, 23 Jan 2023 17:16:14 +0100 Subject: [PATCH 05/10] chore: fix type --- tests/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/requirements.txt b/tests/requirements.txt index 6842e71a..acd705ca 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -3,4 +3,4 @@ # spacy pipelines https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz -https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_md-3.2.0.tar.gz \ No newline at end of file +https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.0/en_core_web_md-3.4.0.tar.gz \ No newline at end of file From 2559c97a8c90bcdce4b2eacf029b8e095fe48ba1 Mon Sep 17 00:00:00 2001 From: Lasse Date: Mon, 23 Jan 2023 17:18:45 +0100 Subject: [PATCH 06/10] chore: update after review --- .../components/quality_data_classes.py | 34 +++++++++---------- src/textdescriptives/utils.py | 11 ------ 2 files changed, 17 insertions(+), 28 deletions(-) diff --git a/src/textdescriptives/components/quality_data_classes.py b/src/textdescriptives/components/quality_data_classes.py index 03e96205..2c565660 100644 --- a/src/textdescriptives/components/quality_data_classes.py +++ b/src/textdescriptives/components/quality_data_classes.py @@ -231,23 +231,23 @@ def passed(self) -> bool: Returns: bool: Whether all thresholds 
have been passed. """ - return all_true_or_none( - [ - self.n_stop_words.passed, - self.alpha_ratio.passed, - self.mean_word_length.passed, - self.doc_length.passed, - all(v.passed for v in self.symbol_to_word_ratio.values()), - self.proportion_ellipsis.passed, - self.proportion_bullet_points.passed, - all(v.passed for v in self.contains.values()), - self.duplicate_line_chr_fraction.passed, - self.duplicate_paragraph_chr_fraction.passed, - all(v.passed for v in self.duplicate_ngram_chr_fraction.values()), - all(v.passed for v in self.top_ngram_chr_fraction.values()), - self.oov_ratio.passed, - ], - ) + passed_or_none = [ + self.n_stop_words.passed, + self.alpha_ratio.passed, + self.mean_word_length.passed, + self.doc_length.passed, + all(v.passed for v in self.symbol_to_word_ratio.values()), + self.proportion_ellipsis.passed, + self.proportion_bullet_points.passed, + all(v.passed for v in self.contains.values()), + self.duplicate_line_chr_fraction.passed, + self.duplicate_paragraph_chr_fraction.passed, + all(v.passed for v in self.duplicate_ngram_chr_fraction.values()), + all(v.passed for v in self.top_ngram_chr_fraction.values()), + self.oov_ratio.passed, + ] + + return all(i is None or i for i in passed_or_none) def __repr_str__(self, join_str: str) -> str: return join_str.join( diff --git a/src/textdescriptives/utils.py b/src/textdescriptives/utils.py index 28044ab1..327e97aa 100644 --- a/src/textdescriptives/utils.py +++ b/src/textdescriptives/utils.py @@ -170,14 +170,3 @@ def _create_spacy_pipeline( spacy_model = _download_spacy_model(lang=lang, size=spacy_model_size) return spacy.load(spacy_model) - -def all_true_or_none(x: Iterable[Optional[bool]]) -> bool: - """Check if all elements in an iterable are True or None. - - Args: - x (Iterable[bool]): Iterable to check - - Returns: - bool: True if all elements are True or None, False otherwise. 
- """ - return all([i is None or i for i in x]) From 9c0671178f8fbdd187e909a09916591edd0e200a Mon Sep 17 00:00:00 2001 From: Lasse Date: Mon, 23 Jan 2023 17:23:17 +0100 Subject: [PATCH 07/10] tests: fix spacy model req for tests --- tests/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/requirements.txt b/tests/requirements.txt index acd705ca..f3f8df6b 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -3,4 +3,4 @@ # spacy pipelines https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz -https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.0/en_core_web_md-3.4.0.tar.gz \ No newline at end of file +https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.2.0/en_core_web_md-3.2.0.tar.gz \ No newline at end of file From 2628ec6f7694198d1c5f0128dd4d95cf92d1a499 Mon Sep 17 00:00:00 2001 From: Lasse Date: Mon, 23 Jan 2023 17:24:16 +0100 Subject: [PATCH 08/10] chore: precommit --- src/textdescriptives/components/quality_data_classes.py | 2 -- src/textdescriptives/utils.py | 1 - 2 files changed, 3 deletions(-) diff --git a/src/textdescriptives/components/quality_data_classes.py b/src/textdescriptives/components/quality_data_classes.py index 2c565660..f0daa8b4 100644 --- a/src/textdescriptives/components/quality_data_classes.py +++ b/src/textdescriptives/components/quality_data_classes.py @@ -3,8 +3,6 @@ from pydantic import BaseModel, Extra, Field -from textdescriptives.utils import all_true_or_none - Interval = Tuple[Optional[float], Optional[float]] diff --git a/src/textdescriptives/utils.py b/src/textdescriptives/utils.py index 327e97aa..741214fd 100644 --- a/src/textdescriptives/utils.py +++ b/src/textdescriptives/utils.py @@ -169,4 +169,3 @@ def _create_spacy_pipeline( msg.info(f"No spacy model provided. 
Inferring spacy model for {lang}.") spacy_model = _download_spacy_model(lang=lang, size=spacy_model_size) return spacy.load(spacy_model) - From 7338488731bf6cd9ede0b33df8b0a86c49ccbc20 Mon Sep 17 00:00:00 2001 From: Lasse Date: Thu, 26 Jan 2023 11:43:40 +0100 Subject: [PATCH 09/10] tests: parameterize oov ratio further --- tests/test_quality.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/test_quality.py b/tests/test_quality.py index 35d08530..26428321 100644 --- a/tests/test_quality.py +++ b/tests/test_quality.py @@ -273,14 +273,20 @@ def test_quality_multi_process(nlp): assert doc._.quality -@pytest.mark.parametrize("vocab", [None, {"This", "is", "a", "test"}]) -def test_oov_ratio(vocab): +@pytest.mark.parametrize( + "text,expected,vocab", + [ + ("This is a test", 0, None), + ("This is a nonwrod", 0.25, None), + ("This is a test", 0, {"This", "is", "a", "test"}), + ("This is a nonwrod", 0.25, {"This", "is", "a", "test"}), + ], +) +def test_oov_ratio(text, expected, vocab): """Test the oov_ratio function.""" nlp = spacy.load("en_core_web_md") - doc = nlp("This is a test") - assert oov_ratio(doc, vocab) == 0 - doc = nlp("This is a nonwrod") - assert oov_ratio(doc, vocab) == 0.25 + doc = nlp(text) + assert oov_ratio(doc, vocab) == expected def test_oov_ratio_small_model(): From 74c22b143fae43227feea28f74f19eff439a3240 Mon Sep 17 00:00:00 2001 From: Lasse Date: Thu, 26 Jan 2023 11:47:58 +0100 Subject: [PATCH 10/10] docs: update quality docs with oov_ratio --- docs/quality.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/quality.rst b/docs/quality.rst index fa2619ca..6d2d4a08 100644 --- a/docs/quality.rst +++ b/docs/quality.rst @@ -13,6 +13,7 @@ Heuristic quality metrics: * Proportion of bullet points (:code:`proportion_bullet_points`): Proportion of lines in a documents which start with a bullet point. 
* Symbol to word ratio (:code:`symbol_{symbol}_2_word_ratio`): Ratio of specified symbols to words, could e.g. include ratio of hashtags or curly brackets. * Contains string (:code:`contains_{string}`): Whether the document contains a specified string. For instance documents containing the string "lorem ipsum". +* Out-of-vocabulary ratio (:code:`oov_ratio`): Ratio of out-of-vocabulary words to total words. Repetitious text metrics: @@ -90,6 +91,7 @@ If you want to specify the thresholds for the quality metrics, you can do so by "10": (None, 0.1), }, top_ngram_chr_fraction={"2": (None, 0.2), "3": (None, 0.18), "4": (None, 0.16)}, + oov_ratio=(None, 0.2) )