Skip to content

Commit

Permalink
Merge pull request #164 from HLasse/HLasse/Add-proportion-of-word-in-…
Browse files Browse the repository at this point in the history
…vocabuary

feat: add out of vocabulary ratio to quality component
  • Loading branch information
HLasse authored Jan 31, 2023
2 parents 10ee54f + 74c22b1 commit 835053d
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 21 deletions.
2 changes: 2 additions & 0 deletions docs/quality.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Heuristic quality metrics:
* Proportion of bullet points (:code:`proportion_bullet_points`): Proportion of lines in a document which start with a bullet point.
* Symbol to word ratio (:code:`symbol_{symbol}_2_word_ratio`): Ratio of specified symbols to words, could e.g. include ratio of hashtags or curly brackets.
* Contains string (:code:`contains_{string}`): Whether the document contains a specified string. For instance documents containing the string "lorem ipsum".
* Out of vocabulary ratio (:code:`oov_ratio`): Ratio of out of vocabulary words to total words.

Repetitious text metrics:

Expand Down Expand Up @@ -90,6 +91,7 @@ If you want to specify the thresholds for the quality metrics, you can do so by
"10": (None, 0.1),
},
top_ngram_chr_fraction={"2": (None, 0.2), "3": (None, 0.18), "4": (None, 0.16)},
oov_ratio=(None, 0.2)
)
Expand Down
43 changes: 42 additions & 1 deletion src/textdescriptives/components/quality.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Component for calculating quality metrics."""
from collections import Counter, defaultdict
from typing import Callable, Dict, List, Optional, Tuple, Union
from typing import Callable, Dict, List, Mapping, Optional, Tuple, Union

import numpy as np
from spacy.language import Language
Expand Down Expand Up @@ -334,6 +334,23 @@ def contains_string(span: Union[Span, Doc], string: str) -> bool:
return string in span.text


def oov_ratio(span: Union[Span, Doc], vocab: Optional[Mapping] = None) -> float:
    """Calculate the out-of-vocabulary (OOV) ratio of a span.

    The ratio is the number of out-of-vocabulary tokens divided by the total
    number of tokens in ``span``.

    Args:
        span (Union[Span, Doc]): A spaCy Span or Doc object.
        vocab (Optional[Mapping], optional): A vocabulary to check against. A
            token counts as out-of-vocabulary when its ``text`` is not
            contained in ``vocab``. If None, the ``is_oov`` attribute of each
            token is used instead. Note that the spaCy vocab is not defined
            for small models. Defaults to None.

    Returns:
        float: The out-of-vocabulary ratio. An empty span yields 0.0.
    """
    n_tokens = len(span)
    # An empty span has no tokens that could be out of vocabulary; return 0.0
    # instead of failing with a ZeroDivisionError.
    if n_tokens == 0:
        return 0.0

    if vocab is None:
        # Rely on the per-token flag when no custom vocabulary is given.
        n_oov = sum(1 for token in span if token.is_oov)
    else:
        n_oov = sum(1 for token in span if token.text not in vocab)
    return n_oov / n_tokens


class Quality:
"""spaCy component for adding text quality metrics to the `Doc` and `Span`
objects.
Expand All @@ -351,6 +368,7 @@ def __init__( # pylint: disable=dangerous-default-value
top_ngram_range: Tuple[int, int],
top_ngram_min_count: int,
duplicate_n_gram_fraction_range: Tuple[int, int],
vocab: Optional[Mapping],
quality_thresholds: Optional[QualityThresholds] = None,
force: bool = False,
): # noqa: D107
Expand All @@ -365,6 +383,7 @@ def __init__( # pylint: disable=dangerous-default-value
if quality_thresholds is None:
quality_thresholds = QualityThresholds()
self.quality_thresholds = quality_thresholds
self.vocab = vocab

self.set_extensions()

Expand Down Expand Up @@ -453,6 +472,20 @@ def quality_setter(
for n_gram, frac in duplicate_ngram_chr_fraction.items()
}

# add oov_ratio if spacy model is not small or has a vocab
# vector length is 0 for small models
if span.vocab.vectors_length > 0 or self.vocab:
value_oov = oov_ratio(span, self.vocab)
thresholds_oov = threshold.oov_ratio
else:
value_oov = None
thresholds_oov = (None, None)

thresholds_outputs["oov_ratio"] = ThresholdsOutput(
value=value_oov,
threshold=thresholds_oov,
)

return QualityOutput(**thresholds_outputs)

def quality_getter(self, span: Union[Span, Doc]) -> QualityOutput:
Expand Down Expand Up @@ -545,6 +578,7 @@ def __call__(self, doc: Doc):
"top_ngram_range": [2, 4],
"top_ngram_min_count": 3,
"duplicate_n_gram_fraction_range": [5, 10],
"vocab": None,
"force": True,
},
)
Expand All @@ -556,6 +590,7 @@ def create_quality_component(
top_ngram_range: Tuple[int, int],
top_ngram_min_count: int,
duplicate_n_gram_fraction_range: Tuple[int, int],
vocab: Optional[Mapping],
force: bool = True,
) -> Callable[[Doc], Doc]:
"""Allows Quality to be added to a spaCy pipe using
Expand Down Expand Up @@ -600,6 +635,11 @@ def create_quality_component(
be considered a top n-gram. Defaults to 3.
duplicate_n_gram_fraction_range (Tuple[int]): range of n-grams to
calculate the proportion of duplicate n-grams. Defaults to [5, 10].
vocab (Optional[Mapping]): vocabulary to use for calculating the
out-of-vocabulary ratio (`oov_ratio`). If None, will use the vocabulary
of the spaCy model. Note, that small spaCy models do not have a
vocabulary. The attribute will only be set if the vocabulary is not
None or the spaCy model is medium or large.
force (bool): whether to overwrite existing extensions. Defaults to True.
Expand All @@ -626,5 +666,6 @@ def create_quality_component(
top_ngram_min_count=top_ngram_min_count,
duplicate_n_gram_fraction_range=duplicate_n_gram_fraction_range,
quality_thresholds=None,
vocab=vocab,
force=force,
)
48 changes: 30 additions & 18 deletions src/textdescriptives/components/quality_data_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,13 @@ class Config:
extra = Extra.forbid

threshold: Union[Interval, bool, None]
value: float
value: Union[float, None]

@property
def passed(self) -> bool:
def passed(self) -> Optional[bool]:
"""Return True if the value is within the thresholds."""
if self.value is None:
return None
if self.threshold is None:
return True
if isinstance(self.threshold, bool):
Expand Down Expand Up @@ -151,6 +153,11 @@ class Config:
+ r"are contained within a duplicate for 2-grams, 18% for 3-grams and 16% "
+ "for 4-grams.",
)
oov_ratio: Interval = Field(
(None, 0.2),
description="A range for the out-of-vocabulary ratio. Default: (None, 0.2)"
+ r" i.e. no lower limit, but at most 20% of words are out-of-vocabulary.",
)


class QualityOutput(BaseModel):
Expand Down Expand Up @@ -211,29 +218,34 @@ class Config:
...,
description="The thresholds output for the top n-gram character fraction.",
)
oov_ratio: ThresholdsOutput = Field(
...,
description="The thresholds output for the out-of-vocabulary ratio.",
)

@property
def passed(self) -> bool:
"""
Returns:
bool: Whether all thresholds have been passed.
"""
return all(
[
self.n_stop_words.passed,
self.alpha_ratio.passed,
self.mean_word_length.passed,
self.doc_length.passed,
all(v.passed for v in self.symbol_to_word_ratio.values()),
self.proportion_ellipsis.passed,
self.proportion_bullet_points.passed,
all(v.passed for v in self.contains.values()),
self.duplicate_line_chr_fraction.passed,
self.duplicate_paragraph_chr_fraction.passed,
all(v.passed for v in self.duplicate_ngram_chr_fraction.values()),
all(v.passed for v in self.top_ngram_chr_fraction.values()),
],
)
passed_or_none = [
self.n_stop_words.passed,
self.alpha_ratio.passed,
self.mean_word_length.passed,
self.doc_length.passed,
all(v.passed for v in self.symbol_to_word_ratio.values()),
self.proportion_ellipsis.passed,
self.proportion_bullet_points.passed,
all(v.passed for v in self.contains.values()),
self.duplicate_line_chr_fraction.passed,
self.duplicate_paragraph_chr_fraction.passed,
all(v.passed for v in self.duplicate_ngram_chr_fraction.values()),
all(v.passed for v in self.top_ngram_chr_fraction.values()),
self.oov_ratio.passed,
]

return all(i is None or i for i in passed_or_none)

def __repr_str__(self, join_str: str) -> str:
return join_str.join(
Expand Down
5 changes: 3 additions & 2 deletions tests/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# only for requirements which can't be specified in the pyproject.toml file
# e.g. links to wheels which is not allowed in pyproject.toml on pypi

# spacy pipeline
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz
# spacy pipelines
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz
https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.2.0/en_core_web_md-3.2.0.tar.gz
28 changes: 28 additions & 0 deletions tests/test_quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@

import pytest
import spacy

import textdescriptives as td
from textdescriptives.components.quality import (
alpha_ratio,
duplicate_ngram_fraction,
mean_word_length,
n_stop_words,
oov_ratio,
proportion_bullet_points,
proportion_ellipsis,
symbol_to_word_ratio,
Expand Down Expand Up @@ -190,6 +192,7 @@ def test_quality_component(nlp: spacy.Language):
assert quality.duplicate_ngram_chr_fraction["5"] == 1
assert abs(quality.top_ngram_chr_fraction["2"].value - 0.44) < 0.01
assert doc._.passed_quality_check is False
assert quality.oov_ratio.value is None
assert quality.passed is False


Expand All @@ -209,6 +212,7 @@ def test_quality_component_with_config(nlp: spacy.Language):
top_ngram_chr_fraction={2: (None, 0.6), 3: (None, 0.6)},
duplicate_ngram_chr_fraction={},
contains={"lorem ipsum": False},
oov_ratio=(None, 0.3),
)

quality_pipe = nlp.add_pipe(
Expand All @@ -231,6 +235,7 @@ def test_quality_component_with_config(nlp: spacy.Language):
assert doc._.quality.duplicate_ngram_chr_fraction["8"] == 1
assert abs(doc._.quality.top_ngram_chr_fraction["3"].value - 0.57) < 0.01
assert doc._.passed_quality_check is True
assert doc._.quality.oov_ratio.value is None


@pytest.mark.parametrize(
Expand Down Expand Up @@ -266,3 +271,26 @@ def test_quality_multi_process(nlp):
docs = nlp.pipe(texts, n_process=2)
for doc in docs:
assert doc._.quality


@pytest.mark.parametrize(
    "text,expected,vocab",
    [
        # vocab=None: fall back to the tokens' own `is_oov` flag
        ("This is a test", 0, None),
        ("This is a nonwrod", 0.25, None),
        # explicit vocabulary: membership of the token text decides
        ("This is a test", 0, {"This", "is", "a", "test"}),
        ("This is a nonwrod", 0.25, {"This", "is", "a", "test"}),
    ],
)
def test_oov_ratio(text, expected, vocab):
    """Check `oov_ratio` both without and with a user-supplied vocabulary."""
    pipeline = spacy.load("en_core_web_md")
    observed = oov_ratio(pipeline(text), vocab)
    assert observed == expected


def test_oov_ratio_small_model():
    """The quality component reports no `oov_ratio` value for the small model."""
    small_nlp = spacy.load("en_core_web_sm")
    small_nlp.add_pipe("textdescriptives/quality")
    oov_output = small_nlp("This is a test")._.quality.oov_ratio
    assert oov_output.value is None

0 comments on commit 835053d

Please sign in to comment.