From 664c0e886a4ff075ea15ec08c9acf1e76a785477 Mon Sep 17 00:00:00 2001 From: Lasse Date: Wed, 26 Apr 2023 10:37:38 +0100 Subject: [PATCH 1/6] feat: add proportions of all pos tags --- docs/posstats.rst | 17 +++++----- .../components/pos_proportions.py | 34 +++++++++++++------ tests/test_pos_proportions.py | 33 +++++++++++++----- 3 files changed, 58 insertions(+), 26 deletions(-) diff --git a/docs/posstats.rst b/docs/posstats.rst index 6824acfd..9d2ef693 100644 --- a/docs/posstats.rst +++ b/docs/posstats.rst @@ -4,11 +4,13 @@ Part-of-Speech Proportions The *pos_proportions* component adds one attribute to a Doc or Span: * :code:`Doc._.pos_proportions` - * Dict of :code:`{pos_prop_POSTAG: proportion of all tokens tagged with POSTAG}`. Does not create a key if no tokens in the document fit the POSTAG. + * Dict of :code:`{pos_prop_POSTAG: proportion of all tokens tagged with POSTAG}`. By default creates a key for each possible POS tag. This behaviour can be turned off + by setting :code:`add_all_tags=False` in the component's initialization. + * :code:`Span._.pos_proportions` * - * Dict of :code:`{pos_prop_POSTAG: proportion of all tokens tagged with POSTAG}`. Does not create a key if no tokens in the document fit the POSTAG. + * Dict of :code:`{pos_prop_POSTAG: proportion of all tokens tagged with POSTAG}`. Usage @@ -29,12 +31,11 @@ Usage td.extract_df(doc) -==== ========================= ============== =============== ============== =============== ================ =============== ============== ============== ================ - .. text pos_prop_DET pos_prop_NOUN pos_prop_AUX pos_prop_VERB pos_prop_PUNCT pos_prop_PRON pos_prop_ADP pos_prop_ADV pos_prop_SCONJ -==== ========================= ============== =============== ============== =============== ================ =============== ============== ============== ================ - 0 The world is changed(...) 
0.097561 0.121951 0.0731707 0.170732 0.146341 0.195122 0.0731707 0.0731707 0.0487805 -==== ========================= ============== =============== ============== =============== ================ =============== ============== ============== ================ - +==== ========================= ============== ============== ============== ============== ================ ============== =============== =============== ============== =============== =============== ================ ================ ================ ============== =============== ============ + .. text pos_prop_ADJ pos_prop_ADP pos_prop_ADV pos_prop_AUX pos_prop_CCONJ pos_prop_DET pos_prop_INTJ pos_prop_NOUN pos_prop_NUM pos_prop_PART pos_prop_PRON pos_prop_PROPN pos_prop_PUNCT pos_prop_SCONJ pos_prop_SYM pos_prop_VERB pos_prop_X +==== ========================= ============== ============== ============== ============== ================ ============== =============== =============== ============== =============== =============== ================ ================ ================ ============== =============== ============ + 0 The world is changed(...) 
0.0243902 0.097561 0.0487805 0.0731707 0 0.097561 0 0.121951 0 0 0.195122 0 0.146341 0.0243902 0 0.170732 0 +==== ========================= ============== ============== ============== ============== ================ ============== =============== =============== ============== =============== =============== ================ ================ ================ ============== =============== ============ ----- diff --git a/src/textdescriptives/components/pos_proportions.py b/src/textdescriptives/components/pos_proportions.py index 37750f59..1186eb3c 100644 --- a/src/textdescriptives/components/pos_proportions.py +++ b/src/textdescriptives/components/pos_proportions.py @@ -10,14 +10,22 @@ class POSProportions: """spaCy v.3.0 component that adds attributes for POS statistics to `Doc` and `Span` objects.""" - def __init__(self, nlp: Language, use_pos: bool): + def __init__(self, nlp: Language, use_pos: bool, add_all_tags: bool): """Initialise components. Args: use_pos: If True, uses the simple POS tag. If False, uses the detailed universal POS tag. + add_all_tags: If True, returns proportions of all possible POS tags. + If False, only returns proportions for the POS tags present in the + text. """ self.use_pos = use_pos + self.add_all_tags = add_all_tags + self.all_upos_tags = ["ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", + "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", + "SCONJ", "SYM", "VERB", "X"] + self.all_model_tags = nlp.meta["labels"]["tagger"] if not Doc.has_extension("pos_proportions"): Doc.set_extension("pos_proportions", getter=self.pos_proportions) @@ -31,20 +39,25 @@ def pos_proportions(self, text: Union[Doc, Span]) -> dict: Returns: Dict containing {pos_prop_POSTAG: proportion of all tokens tagged with - POSTAG. Does not create a key if no tokens in the document fit the - POSTAG. + POSTAG. 
""" - pos_counts: Counter = Counter() + if self.add_all_tags: + if self.use_pos: + pos_counts = Counter(self.all_upos_tags) + else: + pos_counts = Counter(self.all_model_tags) + else: + pos_counts: Counter = Counter() + if self.use_pos: pos_counts.update([token.pos_ for token in text]) else: pos_counts.update([token.tag_ for token in text]) - pos_proportions = { - "pos_prop_" + tag: count / len(text) for tag, count in pos_counts.items() + return { + # subtract 1 from count to account for the instantiation of the counter + "pos_prop_" + tag: (count - 1) / len(text) for tag, count in pos_counts.items() } - return pos_proportions - def __call__(self, doc): """Run the pipeline component.""" return doc @@ -53,12 +66,13 @@ def __call__(self, doc): @Language.factory( "textdescriptives/pos_proportions", assigns=["doc._.pos_proportions", "span._.pos_proportions"], - default_config={"use_pos": True}, + default_config={"use_pos": True, "add_all_tags": True}, ) def create_pos_proportions_component( nlp: Language, name: str, use_pos: bool, + add_all_tags: bool, ) -> Callable[[Doc], Doc]: """Allows PosPropotions to be added to a spaCy pipe using nlp.add_pipe("textdescriptives/pos_proportions") @@ -94,4 +108,4 @@ def create_pos_proportions_component( + "a spaCy model which includes a 'tagger' or an 'attribute ruler' " + "component.", ) - return POSProportions(nlp, use_pos=use_pos) + return POSProportions(nlp, use_pos=use_pos, add_all_tags=add_all_tags) diff --git a/tests/test_pos_proportions.py b/tests/test_pos_proportions.py index e8ffed71..894fa107 100644 --- a/tests/test_pos_proportions.py +++ b/tests/test_pos_proportions.py @@ -1,6 +1,7 @@ import pytest import spacy from spacy.tokens import Doc + import textdescriptives as td # noqa: F401 @@ -81,15 +82,23 @@ def test_pos_integrations(nlp): def test_pos_proportions_doc(doc): assert doc._.pos_proportions == pytest.approx( { - "pos_prop_ADV": 0.1666, + "pos_prop_ADJ": 0.1666, + "pos_prop_ADP": 0.0, "pos_prop_AUX": 0.125, + 
"pos_prop_ADV": 0.1666, + "pos_prop_CCONJ": 0.0416, "pos_prop_DET": 0.083, - "pos_prop_ADJ": 0.1666, + "pos_prop_INTJ": 0.0, "pos_prop_NOUN": 0.0833, - "pos_prop_PUNCT": 0.125, + "pos_prop_NUM": 0.0, + "pos_prop_PART": 0.0, "pos_prop_PRON": 0.125, + "pos_prop_PROPN": 0.0, + "pos_prop_PUNCT": 0.125, + "pos_prop_SCONJ": 0.0, + "pos_prop_SYM": 0.0, "pos_prop_VERB": 0.083, - "pos_prop_CCONJ": 0.0416, + "pos_prop_X": 0.0, }, rel=0.05, ) @@ -100,15 +109,23 @@ def test_pos_proportions_span(doc): assert span._.pos_proportions == pytest.approx( { - "pos_prop_ADV": 0.1666, + "pos_prop_ADJ": 0.1666, + "pos_prop_ADP": 0.0, "pos_prop_AUX": 0.125, + "pos_prop_ADV": 0.1666, + "pos_prop_CCONJ": 0.0416, "pos_prop_DET": 0.083, - "pos_prop_ADJ": 0.1666, + "pos_prop_INTJ": 0.0, "pos_prop_NOUN": 0.0833, - "pos_prop_PUNCT": 0.125, + "pos_prop_NUM": 0.0, + "pos_prop_PART": 0.0, "pos_prop_PRON": 0.125, + "pos_prop_PROPN": 0.0, + "pos_prop_PUNCT": 0.125, + "pos_prop_SCONJ": 0.0, + "pos_prop_SYM": 0.0, "pos_prop_VERB": 0.083, - "pos_prop_CCONJ": 0.0416, + "pos_prop_X": 0.0, }, rel=0.01, ) From e80799b3059451a48e5ac92ac7ada44e7f820aa6 Mon Sep 17 00:00:00 2001 From: Lasse Date: Wed, 26 Apr 2023 10:52:04 +0100 Subject: [PATCH 2/6] chore: pre-commit --- .../components/pos_proportions.py | 41 +++++++++++++------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/src/textdescriptives/components/pos_proportions.py b/src/textdescriptives/components/pos_proportions.py index 1186eb3c..f4b86a02 100644 --- a/src/textdescriptives/components/pos_proportions.py +++ b/src/textdescriptives/components/pos_proportions.py @@ -16,15 +16,31 @@ def __init__(self, nlp: Language, use_pos: bool, add_all_tags: bool): Args: use_pos: If True, uses the simple POS tag. If False, uses the detailed universal POS tag. - add_all_tags: If True, returns proportions of all possible POS tags. 
- If False, only returns proportions for the POS tags present in the + add_all_tags: If True, returns proportions of all possible POS tags. + If False, only returns proportions for the POS tags present in the text. """ - self.use_pos = use_pos - self.add_all_tags = add_all_tags - self.all_upos_tags = ["ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", - "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", - "SCONJ", "SYM", "VERB", "X"] + self.use_pos: bool = use_pos + self.add_all_tags: bool = add_all_tags + self.all_upos_tags = [ + "ADJ", + "ADP", + "ADV", + "AUX", + "CCONJ", + "DET", + "INTJ", + "NOUN", + "NUM", + "PART", + "PRON", + "PROPN", + "PUNCT", + "SCONJ", + "SYM", + "VERB", + "X", + ] self.all_model_tags = nlp.meta["labels"]["tagger"] if not Doc.has_extension("pos_proportions"): @@ -39,15 +55,15 @@ def pos_proportions(self, text: Union[Doc, Span]) -> dict: Returns: Dict containing {pos_prop_POSTAG: proportion of all tokens tagged with - POSTAG. + POSTAG. """ if self.add_all_tags: if self.use_pos: - pos_counts = Counter(self.all_upos_tags) + pos_counts: Counter = Counter(self.all_upos_tags) # type: ignore else: - pos_counts = Counter(self.all_model_tags) + pos_counts: Counter = Counter(self.all_model_tags) # type: ignore else: - pos_counts: Counter = Counter() + pos_counts: Counter = Counter() # type: ignore if self.use_pos: pos_counts.update([token.pos_ for token in text]) @@ -55,7 +71,8 @@ def pos_proportions(self, text: Union[Doc, Span]) -> dict: pos_counts.update([token.tag_ for token in text]) return { # subtract 1 from count to account for the instantiation of the counter - "pos_prop_" + tag: (count - 1) / len(text) for tag, count in pos_counts.items() + f"pos_prop_{tag}": (count - 1) / len(text) + for tag, count in pos_counts.items() } def __call__(self, doc): From 762b7b2f454eaeec214cf5c2421dab75a8ec95c4 Mon Sep 17 00:00:00 2001 From: Lasse Date: Wed, 26 Apr 2023 10:52:18 +0100 Subject: [PATCH 3/6] fix: handle empty strings in pos_proportions --- 
src/textdescriptives/components/pos_proportions.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/textdescriptives/components/pos_proportions.py b/src/textdescriptives/components/pos_proportions.py index f4b86a02..1c1d18c4 100644 --- a/src/textdescriptives/components/pos_proportions.py +++ b/src/textdescriptives/components/pos_proportions.py @@ -2,6 +2,7 @@ from typing import Callable, Counter, Union +import numpy as np from spacy.language import Language from spacy.tokens import Doc, Span @@ -69,10 +70,11 @@ def pos_proportions(self, text: Union[Doc, Span]) -> dict: pos_counts.update([token.pos_ for token in text]) else: pos_counts.update([token.tag_ for token in text]) + len_text = len(text) return { # subtract 1 from count to account for the instantiation of the counter - f"pos_prop_{tag}": (count - 1) / len(text) - for tag, count in pos_counts.items() + f"pos_prop_{tag}": (count - 1) / len(text) if len_text > 0 else np.nan + for tag, count in pos_counts.items() } def __call__(self, doc): @@ -126,3 +128,4 @@ def create_pos_proportions_component( + "component.", ) return POSProportions(nlp, use_pos=use_pos, add_all_tags=add_all_tags) + return POSProportions(nlp, use_pos=use_pos, add_all_tags=add_all_tags) From c703203e9f70dd6458fa2e3d4a8571c2fc81fc06 Mon Sep 17 00:00:00 2001 From: Lasse Date: Wed, 26 Apr 2023 10:52:44 +0100 Subject: [PATCH 4/6] chore: pre-commit --- src/textdescriptives/components/pos_proportions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/textdescriptives/components/pos_proportions.py b/src/textdescriptives/components/pos_proportions.py index 1c1d18c4..b221d146 100644 --- a/src/textdescriptives/components/pos_proportions.py +++ b/src/textdescriptives/components/pos_proportions.py @@ -74,7 +74,7 @@ def pos_proportions(self, text: Union[Doc, Span]) -> dict: return { # subtract 1 from count to account for the instantiation of the counter f"pos_prop_{tag}": (count - 1) / len(text) if 
len_text > 0 else np.nan - for tag, count in pos_counts.items() + for tag, count in pos_counts.items() } def __call__(self, doc): From bfeaa58796ebd3ddac1ebed5a83772dfce822197 Mon Sep 17 00:00:00 2001 From: Lasse Date: Wed, 26 Apr 2023 10:57:41 +0100 Subject: [PATCH 5/6] chore: minor refactor --- .../components/pos_proportions.py | 29 +++---------------- src/textdescriptives/components/utils.py | 21 ++++++++++++++ 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/textdescriptives/components/pos_proportions.py b/src/textdescriptives/components/pos_proportions.py index b221d146..a96bc7a1 100644 --- a/src/textdescriptives/components/pos_proportions.py +++ b/src/textdescriptives/components/pos_proportions.py @@ -6,6 +6,8 @@ from spacy.language import Language from spacy.tokens import Doc, Span +from textdescriptives.components.utils import all_upos_tags + class POSProportions: """spaCy v.3.0 component that adds attributes for POS statistics to `Doc` @@ -23,26 +25,7 @@ def __init__(self, nlp: Language, use_pos: bool, add_all_tags: bool): """ self.use_pos: bool = use_pos self.add_all_tags: bool = add_all_tags - self.all_upos_tags = [ - "ADJ", - "ADP", - "ADV", - "AUX", - "CCONJ", - "DET", - "INTJ", - "NOUN", - "NUM", - "PART", - "PRON", - "PROPN", - "PUNCT", - "SCONJ", - "SYM", - "VERB", - "X", - ] - self.all_model_tags = nlp.meta["labels"]["tagger"] + self.model_tags = all_upos_tags if use_pos else nlp.meta["labels"]["tagger"] if not Doc.has_extension("pos_proportions"): Doc.set_extension("pos_proportions", getter=self.pos_proportions) @@ -59,10 +42,7 @@ def pos_proportions(self, text: Union[Doc, Span]) -> dict: POSTAG. 
""" if self.add_all_tags: - if self.use_pos: - pos_counts: Counter = Counter(self.all_upos_tags) # type: ignore - else: - pos_counts: Counter = Counter(self.all_model_tags) # type: ignore + pos_counts: Counter = Counter(self.model_tags) # type: ignore else: pos_counts: Counter = Counter() # type: ignore @@ -128,4 +108,3 @@ def create_pos_proportions_component( + "component.", ) return POSProportions(nlp, use_pos=use_pos, add_all_tags=add_all_tags) - return POSProportions(nlp, use_pos=use_pos, add_all_tags=add_all_tags) diff --git a/src/textdescriptives/components/utils.py b/src/textdescriptives/components/utils.py index 377a4f09..92137a9b 100644 --- a/src/textdescriptives/components/utils.py +++ b/src/textdescriptives/components/utils.py @@ -35,3 +35,24 @@ def count_syl(token: Token): return max(1, word_hyphenated.count("-") + 1) return [count_syl(token) for token in filter_tokens(doc)] + + +all_upos_tags = [ + "ADJ", + "ADP", + "ADV", + "AUX", + "CCONJ", + "DET", + "INTJ", + "NOUN", + "NUM", + "PART", + "PRON", + "PROPN", + "PUNCT", + "SCONJ", + "SYM", + "VERB", + "X", +] From 4632407bb5c02cb1a602e287b8e77918390214ca Mon Sep 17 00:00:00 2001 From: Lasse Date: Wed, 26 Apr 2023 12:52:30 +0100 Subject: [PATCH 6/6] fix: Listed metrics deviate between extraction functions in docs Fixes #219 --- src/textdescriptives/extractors.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/textdescriptives/extractors.py b/src/textdescriptives/extractors.py index 8593b84d..6b08d6b3 100644 --- a/src/textdescriptives/extractors.py +++ b/src/textdescriptives/extractors.py @@ -38,10 +38,10 @@ def extract_dict( Args: docs (Union[Iterable[Doc], Doc]): An iterable of spaCy Docs or a single Doc metrics (Union[list[str], str, None], optional): Which metrics to extract. - One or more of ["descriptive_stats", "readability", - "dependency_distance", "pos_proportions", "information_theory"]. 
- Defaults to None in which case it will extract metrics for which a - pipeline compoenent has been set. + One or more of ["descriptive_stats", "readability", + "dependency_distance", "pos_proportions", "coherence", "quality", + "information_theory"]. Defaults to None in which case it will + extract metrics for which a pipeline component has been set. include_text (bool, optional): Whether to add an entry containing the text. Defaults to True. @@ -94,10 +94,11 @@ def extract_df( Args: docs (Union[Iterable[Doc], Doc]): An iterable of spaCy Docs or a single Doc metrics (Union[list[str], str], optional): Which metrics to extract. - One or more of ["descriptive_stats", "readability", - "dependency_distance", "pos_proportions"]. Defaults to None in which - case it will extract metrics for which a pipeline compoenent has been - set. + One or more of ["descriptive_stats", "readability", + "dependency_distance", "pos_proportions", "coherence", "quality", + "information_theory"]. Defaults to None in which + case it will extract metrics for which a pipeline component has been + set. include_text (bool, optional): Whether to add a column containing the text. Defaults to True. @@ -125,8 +126,9 @@ def extract_metrics( model for the language. Defaults to None. metrics (List[str]): Which metrics to extract. One or more of ["descriptive_stats", "readability", - "dependency_distance", "pos_proportions", "coherence", "quality"]. If None, - will extract all metrics from textdescriptives. Defaults to None. + "dependency_distance", "pos_proportions", "coherence", "quality", + "information_theory"]. If None, will extract all metrics from + textdescriptives. Defaults to None. spacy_model_size (str, optional): Size of the spacy model to download. Returns: