Skip to content

Commit

Permalink
Merge pull request #223 from HLasse/HLasse/Listed-metrics-deviate-bet…
Browse files Browse the repository at this point in the history
…ween-extraction-functions-in-docs

HLasse/Listed-metrics-deviate-between-extraction-functions-in-docs
  • Loading branch information
HLasse authored Apr 26, 2023
2 parents d8676ba + 4632407 commit 84ce853
Show file tree
Hide file tree
Showing 5 changed files with 90 additions and 36 deletions.
17 changes: 9 additions & 8 deletions docs/posstats.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@ Part-of-Speech Proportions
The *pos_proportions* component adds one attribute to a Doc or Span:

* :code:`Doc._.pos_proportions`
* Dict of :code:`{pos_prop_POSTAG: proportion of all tokens tagged with POSTAG}`. Does not create a key if no tokens in the document fit the POSTAG.
* Dict of :code:`{pos_prop_POSTAG: proportion of all tokens tagged with POSTAG}`. By default creates a key for each possible POS tag. This behaviour can be turned off
by setting :code:`add_all_tags=False` in the component's initialization.


* :code:`Span._.pos_proportions`
*
* Dict of :code:`{pos_prop_POSTAG: proportion of all tokens tagged with POSTAG}`. Does not create a key if no tokens in the document fit the POSTAG.
* Dict of :code:`{pos_prop_POSTAG: proportion of all tokens tagged with POSTAG}`.


Usage
Expand All @@ -29,12 +31,11 @@ Usage
td.extract_df(doc)
==== ========================= ============== =============== ============== =============== ================ =============== ============== ============== ================
.. text pos_prop_DET pos_prop_NOUN pos_prop_AUX pos_prop_VERB pos_prop_PUNCT pos_prop_PRON pos_prop_ADP pos_prop_ADV pos_prop_SCONJ
==== ========================= ============== =============== ============== =============== ================ =============== ============== ============== ================
0 The world is changed(...) 0.097561 0.121951 0.0731707 0.170732 0.146341 0.195122 0.0731707 0.0731707 0.0487805
==== ========================= ============== =============== ============== =============== ================ =============== ============== ============== ================

==== ========================= ============== ============== ============== ============== ================ ============== =============== =============== ============== =============== =============== ================ ================ ================ ============== =============== ============
.. text pos_prop_ADJ pos_prop_ADP pos_prop_ADV pos_prop_AUX pos_prop_CCONJ pos_prop_DET pos_prop_INTJ pos_prop_NOUN pos_prop_NUM pos_prop_PART pos_prop_PRON pos_prop_PROPN pos_prop_PUNCT pos_prop_SCONJ pos_prop_SYM pos_prop_VERB pos_prop_X
==== ========================= ============== ============== ============== ============== ================ ============== =============== =============== ============== =============== =============== ================ ================ ================ ============== =============== ============
0 The world is changed(...) 0.0243902 0.097561 0.0487805 0.0731707 0 0.097561 0 0.121951 0 0 0.195122 0 0.146341 0.0243902 0 0.170732 0
==== ========================= ============== ============== ============== ============== ================ ============== =============== =============== ============== =============== =============== ================ ================ ================ ============== =============== ============
-----


Expand Down
33 changes: 23 additions & 10 deletions src/textdescriptives/components/pos_proportions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,30 @@

from typing import Callable, Counter, Union

import numpy as np
from spacy.language import Language
from spacy.tokens import Doc, Span

from textdescriptives.components.utils import all_upos_tags


class POSProportions:
"""spaCy v.3.0 component that adds attributes for POS statistics to `Doc`
and `Span` objects."""

def __init__(self, nlp: Language, use_pos: bool):
def __init__(self, nlp: Language, use_pos: bool, add_all_tags: bool):
"""Initialise components.
Args:
use_pos: If True, uses the coarse-grained universal POS tag (`token.pos_`).
If False, uses the fine-grained tag assigned by the tagger (`token.tag_`).
add_all_tags: If True, returns proportions of all possible POS tags.
If False, only returns proportions for the POS tags present in the
text.
"""
self.use_pos = use_pos
self.use_pos: bool = use_pos
self.add_all_tags: bool = add_all_tags
self.model_tags = all_upos_tags if use_pos else nlp.meta["labels"]["tagger"]

if not Doc.has_extension("pos_proportions"):
Doc.set_extension("pos_proportions", getter=self.pos_proportions)
Expand All @@ -31,20 +39,24 @@ def pos_proportions(self, text: Union[Doc, Span]) -> dict:
Returns:
Dict containing {pos_prop_POSTAG: proportion of all tokens tagged with
POSTAG}. If `add_all_tags` is False, does not create a key if no tokens
in the document fit the POSTAG.
"""
pos_counts: Counter = Counter()
if self.add_all_tags:
pos_counts: Counter = Counter(self.model_tags) # type: ignore
else:
pos_counts: Counter = Counter() # type: ignore

if self.use_pos:
pos_counts.update([token.pos_ for token in text])
else:
pos_counts.update([token.tag_ for token in text])
pos_proportions = {
"pos_prop_" + tag: count / len(text) for tag, count in pos_counts.items()
len_text = len(text)
return {
# subtract 1 from count to account for the instantiation of the counter
f"pos_prop_{tag}": (count - 1) / len(text) if len_text > 0 else np.nan
for tag, count in pos_counts.items()
}

return pos_proportions

def __call__(self, doc):
"""Run the pipeline component."""
return doc
Expand All @@ -53,12 +65,13 @@ def __call__(self, doc):
@Language.factory(
"textdescriptives/pos_proportions",
assigns=["doc._.pos_proportions", "span._.pos_proportions"],
default_config={"use_pos": True},
default_config={"use_pos": True, "add_all_tags": True},
)
def create_pos_proportions_component(
nlp: Language,
name: str,
use_pos: bool,
add_all_tags: bool,
) -> Callable[[Doc], Doc]:
"""Allows PosPropotions to be added to a spaCy pipe using
nlp.add_pipe("textdescriptives/pos_proportions")
Expand Down Expand Up @@ -94,4 +107,4 @@ def create_pos_proportions_component(
+ "a spaCy model which includes a 'tagger' or an 'attribute ruler' "
+ "component.",
)
return POSProportions(nlp, use_pos=use_pos)
return POSProportions(nlp, use_pos=use_pos, add_all_tags=add_all_tags)
21 changes: 21 additions & 0 deletions src/textdescriptives/components/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,24 @@ def count_syl(token: Token):
return max(1, word_hyphenated.count("-") + 1)

return [count_syl(token) for token in filter_tokens(doc)]


# The 17 coarse-grained universal POS (UPOS) tags, in alphabetical order.
# Kept as a list so callers can iterate over it or seed a Counter with it.
all_upos_tags = (
    "ADJ ADP ADV AUX CCONJ DET INTJ NOUN NUM PART PRON PROPN PUNCT SCONJ SYM VERB X"
).split()
22 changes: 12 additions & 10 deletions src/textdescriptives/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,10 @@ def extract_dict(
Args:
docs (Union[Iterable[Doc], Doc]): An iterable of spaCy Docs or a single Doc
metrics (Union[list[str], str, None], optional): Which metrics to extract.
One or more of ["descriptive_stats", "readability",
"dependency_distance", "pos_proportions", "information_theory"].
Defaults to None in which case it will extract metrics for which a
pipeline component has been set.
One or more of ["descriptive_stats", "readability",
"dependency_distance", "pos_proportions", "coherence", "quality",
"information_theory"]. Defaults to None in which case it will
extract metrics for which a pipeline component has been set.
include_text (bool, optional): Whether to add an entry containing the text.
Defaults to True.
Expand Down Expand Up @@ -94,10 +94,11 @@ def extract_df(
Args:
docs (Union[Iterable[Doc], Doc]): An iterable of spaCy Docs or a single Doc
metrics (Union[list[str], str], optional): Which metrics to extract.
One or more of ["descriptive_stats", "readability",
"dependency_distance", "pos_proportions"]. Defaults to None in which
case it will extract metrics for which a pipeline component has been
set.
One or more of ["descriptive_stats", "readability",
"dependency_distance", "pos_proportions", "coherence", "quality",
"information_theory"]. Defaults to None in which
case it will extract metrics for which a pipeline component has been
set.
include_text (bool, optional): Whether to add a column containing the text.
Defaults to True.
Expand Down Expand Up @@ -125,8 +126,9 @@ def extract_metrics(
model for the language. Defaults to None.
metrics (List[str]): Which metrics to extract.
One or more of ["descriptive_stats", "readability",
"dependency_distance", "pos_proportions", "coherence", "quality"]. If None,
will extract all metrics from textdescriptives. Defaults to None.
"dependency_distance", "pos_proportions", "coherence", "quality",
"information_theory"]. If None, will extract all metrics from
textdescriptives. Defaults to None.
spacy_model_size (str, optional): Size of the spacy model to download.
Returns:
Expand Down
33 changes: 25 additions & 8 deletions tests/test_pos_proportions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pytest
import spacy
from spacy.tokens import Doc

import textdescriptives as td # noqa: F401


Expand Down Expand Up @@ -81,15 +82,23 @@ def test_pos_integrations(nlp):
def test_pos_proportions_doc(doc):
assert doc._.pos_proportions == pytest.approx(
{
"pos_prop_ADV": 0.1666,
"pos_prop_ADJ": 0.1666,
"pos_prop_ADP": 0.0,
"pos_prop_AUX": 0.125,
"pos_prop_ADV": 0.1666,
"pos_prop_CCONJ": 0.0416,
"pos_prop_DET": 0.083,
"pos_prop_ADJ": 0.1666,
"pos_prop_INTJ": 0.0,
"pos_prop_NOUN": 0.0833,
"pos_prop_PUNCT": 0.125,
"pos_prop_NUM": 0.0,
"pos_prop_PART": 0.0,
"pos_prop_PRON": 0.125,
"pos_prop_PROPN": 0.0,
"pos_prop_PUNCT": 0.125,
"pos_prop_SCONJ": 0.0,
"pos_prop_SYM": 0.0,
"pos_prop_VERB": 0.083,
"pos_prop_CCONJ": 0.0416,
"pos_prop_X": 0.0,
},
rel=0.05,
)
Expand All @@ -100,15 +109,23 @@ def test_pos_proportions_span(doc):

assert span._.pos_proportions == pytest.approx(
{
"pos_prop_ADV": 0.1666,
"pos_prop_ADJ": 0.1666,
"pos_prop_ADP": 0.0,
"pos_prop_AUX": 0.125,
"pos_prop_ADV": 0.1666,
"pos_prop_CCONJ": 0.0416,
"pos_prop_DET": 0.083,
"pos_prop_ADJ": 0.1666,
"pos_prop_INTJ": 0.0,
"pos_prop_NOUN": 0.0833,
"pos_prop_PUNCT": 0.125,
"pos_prop_NUM": 0.0,
"pos_prop_PART": 0.0,
"pos_prop_PRON": 0.125,
"pos_prop_PROPN": 0.0,
"pos_prop_PUNCT": 0.125,
"pos_prop_SCONJ": 0.0,
"pos_prop_SYM": 0.0,
"pos_prop_VERB": 0.083,
"pos_prop_CCONJ": 0.0416,
"pos_prop_X": 0.0,
},
rel=0.01,
)

0 comments on commit 84ce853

Please sign in to comment.