diff --git a/docs/quality.rst b/docs/quality.rst index 77fa0c09..fa2619ca 100644 --- a/docs/quality.rst +++ b/docs/quality.rst @@ -71,28 +71,30 @@ If you want to specify the thresholds for the quality metrics, you can do so by # set thresholds for quality metrics (these are just the default) thresholds = QualityThresholds( - n_stop_words=(2, None), - alpha_ratio=(0.8, None), - mean_word_length=(3, 10), - doc_length= (10, 100_000), - symbol_hashtag_to_word_ratio=(None, 0.1), + n_stop_words=(2, None), # at least 2 stop words, no upper bound + alpha_ratio=(0.7, None), + mean_word_length=(3, 10), # mean word length between 3 and 10 characters + doc_length=(10, 100000), + symbol_to_word_ratio={"#": (None, 0.1)}, proportion_ellipsis=(None, 0.3), proportion_bullet_points=(None, 0.8), + contains={"lorem ipsum": False}, duplicate_line_chr_fraction=(None, 0.2), duplicate_paragraph_chr_fraction=(None, 0.2), - duplicate_5gram_chr_fraction=(None, 0.15), - duplicate_6gram_chr_fraction=(None, 0.14), - duplicate_7gram_chr_fraction=(None, 0.13), - duplicate_8gram_chr_fraction=(None, 0.12), - duplicate_9gram_chr_fraction=(None, 0.11), - duplicate_10gram_chr_fraction=(None, 0.1), - top_2gram_chr_fraction=(None, 0.20), - top_3gram_chr_fraction=(None, 0.18), - top_4gram_chr_fraction=(None, 0.16), - contains_lorem_ipsum=False + duplicate_ngram_chr_fraction={ + "5": (None, 0.15), + "6": (None, 0.14), + "7": (None, 0.13), + "8": (None, 0.12), + "9": (None, 0.11), + "10": (None, 0.1), + }, + top_ngram_chr_fraction={"2": (None, 0.2), "3": (None, 0.18), "4": (None, 0.16)}, ) - nlp.add_pipe("textdescriptives.quality", config={"quality_thresholds": thresholds.dict()}) + + quality_pipe = nlp.add_pipe("textdescriptives.quality") + quality_pipe.set_quality_thresholds(thresholds) # update the quality thresholds doc = nlp("The world is changed. I feel it in the water. I feel it in the earth. I smell it in the air. 
Much that once was is lost, for none now live who remember it.") # all attributes are stored as a dict in the ._.quality attribute @@ -112,5 +114,6 @@ Component Data Classes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autopydantic_model:: textdescriptives.components.quality.QualityThresholds - +.. autopydantic_model:: textdescriptives.components.quality_data_classes.QualityThresholds +.. autopydantic_model:: textdescriptives.components.quality_data_classes.QualityOutput +.. autopydantic_model:: textdescriptives.components.quality_data_classes.ThresholdsOutput diff --git a/docs/tutorial.rst b/docs/tutorial.rst index afada036..333c209e 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -10,4 +10,5 @@ locally. :caption: Tutorials tutorials/introductory_tutorial.ipynb + tutorials/filter_corpus_using_quality.ipynb diff --git a/docs/tutorials/filter_corpus_using_quality.ipynb b/docs/tutorials/filter_corpus_using_quality.ipynb new file mode 100644 index 00000000..8ed71383 --- /dev/null +++ b/docs/tutorials/filter_corpus_using_quality.ipynb @@ -0,0 +1,1197 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Filtering corpora using Quality\n", + "\n", + "\n", + " \"Open\n", + "\n", + "\n", + "In many cases if you want to analyse tweets, train a model on text scraped from the web or similar, it is important to filter out low-quality texts.\n", + "\n", + "TextDescriptives implements a series of heuristic filters for removing low-quality text. This tutorial will take you through how to use these to filter\n", + "your text corpora." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "For this we will use datasets available on [Huggingface Datasets](https://huggingface.co/datasets). Thus we will need the `datasets` package. 
Which you can install by running\n", + "\n", + "```python\n", + "!pip install datasets\n", + "```\n", + "\n", + "Or by installing textdescriptives with the `[tutorials]` option as below" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " import textdescriptives as td\n", + "except:\n", + " !pip install \"textdescriptives[tutorials]\"\n", + " import textdescriptives as td" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Filtering Web content\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### The Data\n", + "For our first example we will filter web content. For this we will use the [mC4 dataset](https://huggingface.co/datasets/mc4). It would take ages to download the whole data so instead we will stream down 1000 samples from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "# stream in the dataset\n", + "dataset = load_dataset(\"mc4\", \"en\", streaming = True, split = \"train\")\n", + "\n", + "# download the first 1 000\n", + "dataset = dataset.take(1000)\n", + "\n", + "# extract the text\n", + "texts = [sample [\"text\"] for sample in dataset]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Posts 4,362\tMore Info\n", + "Okay so to those of you that were very helpful this is not to you but for those of you that laugh when I ask about ohms or powering LSi15's this is to you. If you know a book, website, or someone to talk to to get more info that I seek so I know what some of you are talking about please share it with me. I ask questions to gain more info on audio thats all. 
Not to get laughed\n" + ] + } + ], + "source": [ + "# let us look at the first part (400 characters) of the first text\n", + "print(texts[0][:400])\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Filtering\n", + "\n", + "To filter texts using `textdescriptives` we need to first set up the pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import spacy\n", + "\n", + "# create the spacy nlp pipeline\n", + "nlp = spacy.blank(\"en\")\n", + "# add a component for sentence segmentation\n", + "nlp.add_pipe(\"sentencizer\")\n", + "# add a component for quality filtering\n", + "quality_pipe = nlp.add_pipe(\"textdescriptives/quality\")\n", + "\n", + "# apply the pipeline to the texts\n", + "docs = nlp.pipe(texts)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You will note here that docs is a generator. This can be quite useful (especially when streaming texts in one at a time), but for this example we can simply convert it to a list:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "docs is type \n", + "docs is type \n" + ] + } + ], + "source": [ + "print(f\"docs is type {type(docs)}\")\n", + "docs = list(docs)\n", + "print(f\"docs is type {type(docs)}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now it is easy to examine the documents using the `doc._.quality` or `doc._.passed_quality_check` extensions:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Posts 4,362\tMore Info\n", + "Okay so to those of you that were very helpful this is not to you but for those of you that laugh when I ask about ohms or powering LSi15's this is to 
you. If you know a book, website, or someone to talk to to get more info that I seek so I know what some of you are talking about please share it with me. I ask questions to gain more info on audio thats all. Not to get laughed at when asking it.\n" + ] + } + ], + "source": [ + "# examine the first document\n", + "doc = docs[0]\n", + "print(doc[:100])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "doc._.passed_quality_check" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It seems like this document did not pass the quality check. Let us examine why that is:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "QualityOutput(\n", + "\tpassed=False, \n", + "\tn_stop_words=ThresholdsOutput(value=435.0, passed=True, threshold=(2.0, None)), \n", + "\talpha_ratio=ThresholdsOutput(value=0.79, passed=True, threshold=(0.7, None)), \n", + "\tmean_word_length=ThresholdsOutput(value=3.52, passed=True, threshold=(3.0, 10.0)), \n", + "\tdoc_length=ThresholdsOutput(value=894.0, passed=True, threshold=(10.0, 100000.0)), \n", + "\tsymbol_to_word_ratio={'#': ThresholdsOutput(value=0.0, passed=True, threshold=(None, 0.1))}, \n", + "\tproportion_ellipsis=ThresholdsOutput(value=0.0, passed=True, threshold=(None, 0.3)), \n", + "\tproportion_bullet_points=ThresholdsOutput(value=0.0, passed=True, threshold=(None, 0.8)), \n", + "\tcontains={'lorem ipsum': ThresholdsOutput(value=0.0, passed=True, threshold=False)}, \n", + "\tduplicate_line_chr_fraction=ThresholdsOutput(value=0.0, passed=True, threshold=(None, 0.2)), \n", + "\tduplicate_paragraph_chr_fraction=ThresholdsOutput(value=0.0, passed=True, threshold=(None, 0.2)), \n", + "\tduplicate_ngram_chr_fraction={'5': 
ThresholdsOutput(value=0.42, passed=False, threshold=(None, 0.15)), '6': ThresholdsOutput(value=0.42, passed=False, threshold=(None, 0.14)), '7': ThresholdsOutput(value=0.38, passed=False, threshold=(None, 0.13)), '8': ThresholdsOutput(value=0.36, passed=False, threshold=(None, 0.12)), '9': ThresholdsOutput(value=0.36, passed=False, threshold=(None, 0.11)), '10': ThresholdsOutput(value=0.36, passed=False, threshold=(None, 0.1))}, \n", + "\ttop_ngram_chr_fraction={'2': ThresholdsOutput(value=0.01, passed=True, threshold=(None, 0.2)), '3': ThresholdsOutput(value=0.01, passed=True, threshold=(None, 0.18)), '4': ThresholdsOutput(value=0.01, passed=True, threshold=(None, 0.16))})" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "doc._.quality" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Naturally, you might not know what all of these mean, but you can easily check it on [the documentation site](https://hlasse.github.io/TextDescriptives/quality.html). Examining these we see that this text has a high proportion of characters which appear in duplicate n-grams `duplicate_10-gram_chr_fraction`. When this fraction is really high it means that the text contains a high proportion of repetitions. 
This is often a sign of low quality text.\n", + "\n", + "If we examine the quality thresholds of the pipeline we can see that the max allowed value for `duplicate_10-gram_chr_fraction` is 0.1:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "n_stop_words=(2, None) alpha_ratio=(0.7, None) mean_word_length=(3, 10) doc_length=(10, 100000) symbol_to_word_ratio={'#': (None, 0.1)} proportion_ellipsis=(None, 0.3) proportion_bullet_points=(None, 0.8) contains={'lorem ipsum': False} duplicate_line_chr_fraction=(None, 0.2) duplicate_paragraph_chr_fraction=(None, 0.2) duplicate_ngram_chr_fraction={'5': (None, 0.15), '6': (None, 0.14), '7': (None, 0.13), '8': (None, 0.12), '9': (None, 0.11), '10': (None, 0.1)} top_ngram_chr_fraction={'2': (None, 0.2), '3': (None, 0.18), '4': (None, 0.16)}\n", + "---\n", + "The thresholds for Duplicate n-grams:\n", + "{'5': (None, 0.15), '6': (None, 0.14), '7': (None, 0.13), '8': (None, 0.12), '9': (None, 0.11), '10': (None, 0.1)}\n" + ] + } + ], + "source": [ + "print(quality_pipe.quality_thresholds)\n", + "\n", + "print(\"---\")\n", + "print(\"The thresholds for Duplicate n-grams:\")\n", + "print(quality_pipe.quality_thresholds.duplicate_ngram_chr_fraction)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Extracting high quality texts\n", + "We are typically interested in text which are not of low quality. We can extract these by filtering out the texts which did not pass the quality check." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "filtered_texts = [doc for doc in docs if doc._.passed_quality_check]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "A total of 1000 texts were processed and 572 passed the quality check.\n" + ] + } + ], + "source": [ + "print(f\"A total of {len(docs)} texts were processed and {len(filtered_texts)} passed the quality check.\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Changing the filters\n", + "In some cases you might want to apply other filters. For instance the current filter sets a `symbol_to_word_ratio` threshold of 0.1 for hashtags `#`. This means that if a text contains a lot of hashtags it will be filtered out. However if you are working on e.g. tweets this is an unreasonable filter and you might want to adjust that. 
You can do this by overwriting the quality_thresholds:" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "new_thresholds = td.QualityThresholds(\n", + " n_stop_words=(2, None), # at least 2 stop words, no upper bound\n", + " alpha_ratio= (0.7, None),\n", + " mean_word_length= (3, 10), # mean word length between 3 and 10 characters\n", + " doc_length = (10, 100_000),\n", + " symbol_to_word_ratio = {}, # don't filter based on symbol to word ratio.\n", + " proportion_ellipsis = (None, 0.3),\n", + " proportion_bullet_points = (None, 0.8),\n", + " contains = {\"lorem ipsum\": False}, # remove texts which contain the string \"lorem ipsum\"\n", + " duplicate_line_chr_fraction = (None, 0.2),\n", + " duplicate_paragraph_chr_fraction = (None, 0.2),\n", + " duplicate_ngram_chr_fraction = {}, # don't filter based on duplicate n-grams\n", + " top_ngram_chr_fraction = {\"2\": (None, 0.2), \"3\": (None, 0.18), \"4\": (None, 0.16)}\n", + ")\n", + "\n", + "# overwrite the existing thresholds\n", + "quality_pipe.set_quality_thresholds(new_thresholds)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to read more about what each argument does, please check out the [documentation](https://hlasse.github.io/TextDescriptives/quality.html#data-classes).\n", + "All the `passed` values and `passed_quality_check` attributes are dynamically updated when you call `.set_quality_thresholds`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check if the new text now pass the quality filter\n", + "doc._.passed_quality_check" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Comparing Domains\n", + "\n", + "These quality metrics are heuristic based and need to be tuned. While the defaults are reasonable for some domains, they may not be for others. We will explore this a bit further in this section. These filters are specifically tuned for the web domain and this can lead to problems when applied directly to other domains.\n", + "\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data\n", + "\n", + "For this we will use the [Danish Gigaword](https://sprogteknologi.dk/dataset/danish-gigaword) available on [Huggingface Datasets](https://huggingface.co/datasets/DDSC/partial-danish-gigaword-no-twitter). For the purpose of this tutorial we will just use a small test version of it containing around 2500 examples, but you could easily change it to use the whole dataset. Danish Gigaword is a large collection of Danish texts collected from a variety of domains." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "We can download the dataset using the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using custom data configuration DDSC--partial-danish-gigaword-small-test-sample-6518b630de09688d\n", + "Found cached dataset parquet (/Users/au561649/.cache/huggingface/datasets/DDSC___parquet/DDSC--partial-danish-gigaword-small-test-sample-6518b630de09688d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2c78220f7f1e4c119901389899b11a7b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textsourcedoc_idLICENSEuridate_built
0Den fulde tekst Pressenævnets kendelse i sag n...retsinformationdkretsinformationdk_173889Danish Copyright law at https://www.retsinform...https://www.retsinformation.dk/Forms/R0710.asp...Fri Nov 22 00:51:31 2019 +0100
1Resume\\n\\nEfter at der var sket afskedigelser ...retsinformationdkretsinformationdk_39059Danish Copyright law at https://www.retsinform...https://www.retsinformation.dk/Forms/R0710.asp...Fri Nov 22 00:51:14 2019 +0100
2Resume\\n\\nContainere kunne ikke anses som genb...retsinformationdkretsinformationdk_15045Danish Copyright law at https://www.retsinform...https://www.retsinformation.dk/Forms/R0710.asp...Fri Nov 22 00:51:28 2019 +0100
3Resume\\n\\nEn forhandler ved »home-parties« af ...retsinformationdkretsinformationdk_37261Danish Copyright law at https://www.retsinform...https://www.retsinformation.dk/Forms/R0710.asp...Fri Nov 22 00:49:27 2019 +0100
4Den fulde tekst\\n\\nSkrivelse om lov om fleksyd...retsinformationdkretsinformationdk_19415Danish Copyright law at https://www.retsinform...https://www.retsinformation.dk/Forms/R0710.asp...Fri Nov 22 00:52:27 2019 +0100
5Resume\\n\\nResumé\\n\\nKlage over påbud om særlig...retsinformationdkretsinformationdk_31217Danish Copyright law at https://www.retsinform...https://www.retsinformation.dk/Forms/R0710.asp...Fri Nov 22 00:49:18 2019 +0100
6Resume\\n\\nResumé\\n\\nI en række af de af Danmar...retsinformationdkretsinformationdk_14387Danish Copyright law at https://www.retsinform...https://www.retsinformation.dk/Forms/R0710.asp...Fri Nov 22 00:49:49 2019 +0100
7Oversigt (indholdsfortegnelse)\\n\\nBilag 1\\n\\nD...retsinformationdkretsinformationdk_166197Danish Copyright law at https://www.retsinform...https://www.retsinformation.dk/Forms/R0710.asp...Fri Nov 22 00:49:44 2019 +0100
8Den fulde tekst\\n\\nBekendtgørelse om afregning...retsinformationdkretsinformationdk_76994Danish Copyright law at https://www.retsinform...https://www.retsinformation.dk/Forms/R0710.asp...Fri Nov 22 00:52:52 2019 +0100
9Den fulde tekst Ligebehandlingsnævnets afgørel...retsinformationdkretsinformationdk_192325Danish Copyright law at https://www.retsinform...https://www.retsinformation.dk/Forms/R0710.asp...Fri Nov 22 00:51:41 2019 +0100
\n", + "" + ], + "text/plain": [ + " text source \\\n", + "0 Den fulde tekst Pressenævnets kendelse i sag n... retsinformationdk \n", + "1 Resume\\n\\nEfter at der var sket afskedigelser ... retsinformationdk \n", + "2 Resume\\n\\nContainere kunne ikke anses som genb... retsinformationdk \n", + "3 Resume\\n\\nEn forhandler ved »home-parties« af ... retsinformationdk \n", + "4 Den fulde tekst\\n\\nSkrivelse om lov om fleksyd... retsinformationdk \n", + "5 Resume\\n\\nResumé\\n\\nKlage over påbud om særlig... retsinformationdk \n", + "6 Resume\\n\\nResumé\\n\\nI en række af de af Danmar... retsinformationdk \n", + "7 Oversigt (indholdsfortegnelse)\\n\\nBilag 1\\n\\nD... retsinformationdk \n", + "8 Den fulde tekst\\n\\nBekendtgørelse om afregning... retsinformationdk \n", + "9 Den fulde tekst Ligebehandlingsnævnets afgørel... retsinformationdk \n", + "\n", + " doc_id \\\n", + "0 retsinformationdk_173889 \n", + "1 retsinformationdk_39059 \n", + "2 retsinformationdk_15045 \n", + "3 retsinformationdk_37261 \n", + "4 retsinformationdk_19415 \n", + "5 retsinformationdk_31217 \n", + "6 retsinformationdk_14387 \n", + "7 retsinformationdk_166197 \n", + "8 retsinformationdk_76994 \n", + "9 retsinformationdk_192325 \n", + "\n", + " LICENSE \\\n", + "0 Danish Copyright law at https://www.retsinform... \n", + "1 Danish Copyright law at https://www.retsinform... \n", + "2 Danish Copyright law at https://www.retsinform... \n", + "3 Danish Copyright law at https://www.retsinform... \n", + "4 Danish Copyright law at https://www.retsinform... \n", + "5 Danish Copyright law at https://www.retsinform... \n", + "6 Danish Copyright law at https://www.retsinform... \n", + "7 Danish Copyright law at https://www.retsinform... \n", + "8 Danish Copyright law at https://www.retsinform... \n", + "9 Danish Copyright law at https://www.retsinform... \n", + "\n", + " uri \\\n", + "0 https://www.retsinformation.dk/Forms/R0710.asp... \n", + "1 https://www.retsinformation.dk/Forms/R0710.asp... 
\n", + "2 https://www.retsinformation.dk/Forms/R0710.asp... \n", + "3 https://www.retsinformation.dk/Forms/R0710.asp... \n", + "4 https://www.retsinformation.dk/Forms/R0710.asp... \n", + "5 https://www.retsinformation.dk/Forms/R0710.asp... \n", + "6 https://www.retsinformation.dk/Forms/R0710.asp... \n", + "7 https://www.retsinformation.dk/Forms/R0710.asp... \n", + "8 https://www.retsinformation.dk/Forms/R0710.asp... \n", + "9 https://www.retsinformation.dk/Forms/R0710.asp... \n", + "\n", + " date_built \n", + "0 Fri Nov 22 00:51:31 2019 +0100 \n", + "1 Fri Nov 22 00:51:14 2019 +0100 \n", + "2 Fri Nov 22 00:51:28 2019 +0100 \n", + "3 Fri Nov 22 00:49:27 2019 +0100 \n", + "4 Fri Nov 22 00:52:27 2019 +0100 \n", + "5 Fri Nov 22 00:49:18 2019 +0100 \n", + "6 Fri Nov 22 00:49:49 2019 +0100 \n", + "7 Fri Nov 22 00:49:44 2019 +0100 \n", + "8 Fri Nov 22 00:52:52 2019 +0100 \n", + "9 Fri Nov 22 00:51:41 2019 +0100 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# We can take a look at one of the examples:\n", + "ten_samples = dataset.select(range(10))\n", + "ten_samples.to_pandas()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As previously mentioned, the Danish Gigaword corpus consists of multiple domains. For this tutorial, we will look at three of these domains: `retsinformationdk` which consists of legal documents, `hest` which contains posts from a Danish debate forum ([heste-nettet.dk](https://www.heste-nettet.dk/)) and `spont` which contains texts transcribed from spontaneous speech." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "cf05115807f14affa8d479778d1c466a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/3 [00:00" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "legal_docs" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "legal_docs = list(legal_docs)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now inspect the output here:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Den fulde tekst Pressenævnets kendelse i sag nr. 15-70-00822\n", + "Resumé\n", + "Foreningen for Skånsomt Kystfiskeri har ikke retlig interesse\n", + "DR bragte et radioindslag om Natur- og Erhvervsstyrelsens fiskeriinspektorats fangst af ulovlige ålefælder. Foreningen for Skånsomt Kystfiskeri klagede blandt andet med den begrundelse, at betegnelsen ” ålefælder ” er forkert, idet ålene selv kan svømme ind og ud. 
Pressenævnet afviser at behandle klagen, da foreningen ikke er omtalt i udsendelsen og derfor ikke har retlig interesse.\n", + "Pressenævnets formand udtaler:\n", + "Det er en betingelse for at klage til Pressenævnet, at\n", + "----\n", + "This passed the quality filter:\n" + ] + }, + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "legal_doc = legal_docs[0]\n", + "\n", + "print(legal_doc[:100]) # print the first 100 tokens\n", + "print(\"----\")\n", + "print(\"This passed the quality filter:\")\n", + "legal_doc._.passed_quality_check" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we see that the text did not pass the quality filter. We can now examine why that is using the following code:" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "QualityOutput(passed=False, n_stop_words=ThresholdsOutput(value=192.0, passed=True, threshold=(2.0, None)), alpha_ratio=ThresholdsOutput(value=0.8, passed=True, threshold=(0.7, None)), mean_word_length=ThresholdsOutput(value=4.55, passed=True, threshold=(3.0, 10.0)), doc_length=ThresholdsOutput(value=500.0, passed=True, threshold=(10.0, 100000.0)), symbol_to_word_ratio={'#': ThresholdsOutput(value=0.0, passed=True, threshold=(None, 0.1))}, proportion_ellipsis=ThresholdsOutput(value=0.0, passed=True, threshold=(None, 0.3)), proportion_bullet_points=ThresholdsOutput(value=0.0, passed=True, threshold=(None, 0.8)), contains={'lorem ipsum': ThresholdsOutput(value=0.0, passed=True, threshold=False)}, duplicate_line_chr_fraction=ThresholdsOutput(value=0.26, passed=False, threshold=(None, 0.2)), duplicate_paragraph_chr_fraction=ThresholdsOutput(value=0.0, passed=True, threshold=(None, 0.2)), duplicate_ngram_chr_fraction={'5': ThresholdsOutput(value=0.54, passed=False, threshold=(None, 0.15)), '6': 
ThresholdsOutput(value=0.52, passed=False, threshold=(None, 0.14)), '7': ThresholdsOutput(value=0.52, passed=False, threshold=(None, 0.13)), '8': ThresholdsOutput(value=0.52, passed=False, threshold=(None, 0.12)), '9': ThresholdsOutput(value=0.52, passed=False, threshold=(None, 0.11)), '10': ThresholdsOutput(value=0.52, passed=False, threshold=(None, 0.1))}, top_ngram_chr_fraction={'2': ThresholdsOutput(value=0.02, passed=True, threshold=(None, 0.2)), '3': ThresholdsOutput(value=0.04, passed=True, threshold=(None, 0.18)), '4': ThresholdsOutput(value=0.07, passed=True, threshold=(None, 0.16))})" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "legal_doc._.quality" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we see that the fraction of characters which are part of a duplicate 10-gram is >50%. This is a reason why the sample was filtered out. This is not uncommon for legal documents which contain a lot of standard phrases. However you might wish to change the threshold for this filter. We showed you how to do this in the previous section. We also see that the `alpha_ratio` is close to 0.8. This means that the text is mostly made up of alphabetic characters. This is good, but as we will see later, this is not common for legal texts." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Filtering out the text\n", + "Assuming we don't want to change the filters we can now use it to filter out the texts that we want to keep:" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# 4. 
Filter out the documents that do not pass the quality\n", + "legal_docs_filtered = [doc for doc in legal_docs if doc._.passed_quality_check]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "We had a total of 1000 which we filtered down to 264.\n" + ] + } + ], + "source": [ + "print(f\"We had a total of {len(legal['text'])} which we filtered down to {len(legal_docs_filtered)}.\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That seems like a lot, we should probably check why that is. We can do this by looking at the distribution of the scores of e.g. duplicate 10-gram fraction:" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import seaborn as sns\n", + "\n", + "def get_duplicate_10_gram_fraction(doc):\n", + " quality = doc._.quality\n", + " duplicate_10_gram_fraction = quality.duplicate_ngram_chr_fraction[\"10\"]\n", + " return duplicate_10_gram_fraction.value\n", + "\n", + "duplicate_10_gram_fraction = [get_duplicate_10_gram_fraction(doc) for doc in legal_docs]\n", + "sns.histplot(duplicate_10_gram_fraction)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This seems like it explains a lot of the texts which were filtered out, but does not explain everything. Let us take a look at the `alpha_ratio` (the proportion of words which contains at least one alphabetic character) as well:" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "alpha_ratio = [doc._.quality.alpha_ratio.value for doc in legal_docs]\n", + "sns.histplot(alpha_ratio)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see that most of the text do not pass the `alpha_ratio` threshold of 0.7 or higher. This is not uncommon for legal documents as e.g. the paragraph sign `§` is not an alphabetic character. It might be relevant to change the threshold to 0.7 or lower." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Comparing across domains\n", + "We see that legal documents have quite a few perculiarities let us examine how the `alpha_ratio` behaves across different domains:" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "# first we apply the pipeline to the other domains\n", + "news_docs = nlp.pipe(news[\"text\"])\n", + "news_docs = list(news_docs)\n", + "speech_docs = nlp.pipe(speech[\"text\"])\n", + "speech_docs = list(speech_docs)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "# extract alpha ratio:\n", + "news_alpha_ratio = [doc._.quality.alpha_ratio.value for doc in news_docs]\n", + "speech_alpha_ratio = [doc._.quality.alpha_ratio.value for doc in speech_docs]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have the metrics we can plot a histogram comparing the metrics:" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "# histogram\n", + "sns.histplot(news_alpha_ratio, label=\"News\", alpha=0.5, binwidth=0.05)\n", + "sns.histplot(alpha_ratio, label=\"Legal\", alpha=0.5, binwidth=0.05)\n", + "sns.histplot(speech_alpha_ratio, label=\"Speech\", alpha=0.5, binwidth=0.05)\n", + "\n", + "# add labels\n", + "plt.xlabel(\"Alpha ratio\")\n", + "plt.ylabel(\"Count\")\n", + "plt.legend()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we see a couple of things:\n", + "- A fair number of legal documents have an alpha ratio above 0.6.\n", + "- Almost no news texts have an alpha ratio below 0.6.\n", + "- The alpha ratio for the Speech corpus is suspiciously low.\n", + "\n", + "Let us examine one of the speech samples a bit more in-depth:" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Taler 6: mm\n", + "Taler 7: er du klar?\n", + "Taler 6: ja\n", + "Taler 7: så er spørgsmålet om vi skal- om det er sådan her ja det kunne man godt okay\n", + "Taler 7: okay så det er ignore tab kill og kill tab\n", + "Taler 6: NA\n", + "Taler 6: kill\n", + "Taler 6: kill tab\n", + "Taler 7: super\n", + "Taler 7: okay det er det hun lige har sagt\n", + "Taler 6: ja\n", + "Taler 6: ja\n", + "Taler 6: NA\n" + ] + } + ], + "source": [ + "doc = speech_docs[0]\n", + "# examine the first 100 tokens in the first document\n", + "print(doc[:100])" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From this we can see that a high proportion of the tokens in the speech dataset denote the speaker, and tokens such as `:` then lower the alpha ratio. 
This might or might not be problematic for the task at hand.\n", + "\n", + "**Therefore, it is important to note that while these filters are useful for filtering large amounts of text, they should be adjusted to the target domain.**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "textdescriptives", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.15" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "31387647799921bb85032eec7bb02e281325ae7f8ffa6f9cd7cdead815b36c88" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pyproject.toml b/pyproject.toml index 65072863..2087d0b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,6 +71,7 @@ tutorials = [ "jupyter", "seaborn", "matplotlib", + "datasets>=2.8.0,<2.9.0", ] [project.readme] diff --git a/src/textdescriptives/components/quality.py b/src/textdescriptives/components/quality.py index 70742d64..e4860d9d 100644 --- a/src/textdescriptives/components/quality.py +++ b/src/textdescriptives/components/quality.py @@ -1,107 +1,12 @@ """Component for calculating quality metrics.""" from collections import Counter, defaultdict -from functools import partial from typing import Callable, Dict, List, Optional, Tuple, Union import numpy as np -from pydantic import BaseModel, Field from spacy.language import Language from spacy.tokens import Doc, Span -Interval = Tuple[Optional[float], Optional[float]] - - -class QualityThresholds(BaseModel): - """Thresholds for quality metrics.""" - - n_stop_words: Interval = Field( - (2, None), - description="A Range for the number of stop words. Default: (2, None), i.e. 
" - + "at least 2 stop words, but no upper limit.", - ) - alpha_ratio: Interval = Field( - (0.8, None), - description="A Range for the alpha ratio. Default: (0.8, None), i.e. at " - + r"least 80% of tokens contain at least one alphabetic character, but no " - + "upper limit.", - ) - mean_word_length: Interval = Field( - (3, 10), - description="A Range for the mean word length. Default: (3, 10), i.e. between" - + " 3 and 10 characters.", - ) - doc_length: Interval = Field( - (10, 100_000), - description="A Range for the document length. Default: (10, 100_000), i.e." - + " between 10 and 100_000 characters.", - ) - symbol_to_word_ratio: Dict[str, Interval] = Field( - {"#": (None, 0.1)}, - description="A dict of symbols and the allowed range for the " - + r"symbol-to-word-ratio. The symbol-to-word-ratio is the ratio between symbol" - + "occurrence and word occurrence. Defaults to {'#': (None, 0.1)} i.e. no lower" - + r" limit, but there must at most be a ratio of 0.1 between the number of of " - + "words and hashtags. i.e. if we have 100 words the symbol should appear no " - + "more than 10 times. Values not in the dict are not checked.", - ) - proportion_ellipsis: Interval = Field( - (None, 0.3), - description="A Range for the proportion of lines which end with ellipsis. " - + "Default: (None, 0.3), " - + r"i.e. no lower limit, but at most 30% of lines end with an ellipsis.", - ) - proportion_bullet_points: Interval = Field( - (None, 0.8), - description="A Range for the proportion lines which start with a bullet " - + r"points. Default: (None, 0.8), i.e. no lower limit, but at most 80% of lines" - + " start with a bullet point.", - ) - contains: Dict[str, bool] = Field( - {"lorem ipsum": False}, - description="A dictionary of strings and whether they should be contained in " - + "the document. Default: {'lorem ipsum': False}, i.e. 
the document should not" - + " contain the string 'lorem ipsum'.", - ) - duplicate_line_chr_fraction: Interval = Field( - (None, 0.2), - description="A Range for the duplicate line character fraction. Default: " - + r"(None, 0.2), i.e. no lower limit, but at most 20% of characters are" - + " duplicates.", - ) - duplicate_paragraph_chr_fraction: Interval = Field( - (None, 0.2), - description="A Range for the duplicate paragraph character fraction. Default:" - + r" (None, 0.2), i.e. no lower limit, but at most 20% of characters are " - + "duplicates.", - ) - duplicate_ngram_chr_fraction: Dict[str, Interval] = Field( - { - "5": (None, 0.15), - "6": (None, 0.14), - "7": (None, 0.13), - "8": (None, 0.12), - "9": (None, 0.11), - "10": (None, 0.1), - }, - description="A dictionary of n-gram lengths and the allowed range for the " - + "duplicate n-gram character fraction. Default: {5: (None, 0.15), 6: (None, " - + "0.14), 7: (None, 0.13), 8: (None, 0.12), 9: (None, 0.11), 10: (None, 0.1)}, " - + r"i.e. no lower limit, but at most 15% of characters are duplicates for " - + r"5-grams, 14% for 6-grams, 13% for 7-grams, 12% for 8-grams, 11% for 9-grams" - + r" and 10% for 10-grams.", - ) - top_ngram_chr_fraction: Dict[str, Interval] = Field( - { - "2": (None, 0.2), - "3": (None, 0.18), - "4": (None, 0.16), - }, - description="A dictionary of n-gram lengths and the allowed range for the " - + "top n-gram character fraction. Default: {2: (None, 0.2), 3: (None, 0.18)" - + r", 4: (None, 0.16)}, i.e. 
no lower limit, but at most 20% of characters " - + r"are contained within a duplicate for 2-grams, 18% for 3-grams and 16% " - + "for 4-grams.", - ) +from .quality_data_classes import QualityOutput, QualityThresholds, ThresholdsOutput def n_stop_words(span: Union[Doc, Span]) -> int: @@ -461,7 +366,28 @@ def __init__( # pylint: disable=dangerous-default-value quality_thresholds = QualityThresholds() self.quality_thresholds = quality_thresholds - self.getters = { + self.set_extensions() + + def quality_setter( + self, + span: Union[Span, Doc], + ) -> QualityOutput: + """Apply quality functions to doc. + + Args: + span (Union[Span, Doc]): spaCy span or doc object + + Returns: + QualityOutput: The quality metrics + """ + threshold = self.quality_thresholds + + thresholds_outputs: Dict[ + str, + Union[Dict[str, ThresholdsOutput], ThresholdsOutput], + ] = {} + # filter with only one threshold + getters = { # heuristic quality filters "n_stop_words": n_stop_words, "alpha_ratio": alpha_ratio, @@ -472,155 +398,136 @@ def __init__( # pylint: disable=dangerous-default-value # text repetition "duplicate_line_chr_fraction": duplicate_line_chr_fraction, "duplicate_paragraph_chr_fraction": duplicate_paragraph_chr_fraction, - "duplicate_ngram_chr_fraction": partial( - duplicate_ngram_fraction, - ngram_range=duplicate_n_gram_fraction_range, - ), - "top_ngram_chr_fraction": partial( - top_ngram_chr_fraction, - ngram_range=top_ngram_range, - min_count=top_ngram_min_count, - ), } - # add symbol to word ratio - for symbol in symbols: - self.getters[f"symbol_{symbol}_to_word_ratio"] = partial( - symbol_to_word_ratio, - symbol=symbol, + + for name, getter in getters.items(): + thresholds_outputs[name] = ThresholdsOutput( + value=getter(span), # type: ignore + threshold=getattr(threshold, name), ) - # add contains - for string in contains: - self.getters[f"contains_{string}"] = partial(contains_string, string=string) - self.extensions = { - "passed_quality_check": 
self.passed_quality_thresholds, - "quality": self.quality_getter, + thresholds_outputs["contains"] = { + string: ThresholdsOutput( + value=contains_string(span, string), + threshold=threshold.contains.get(string, None), + ) + for string in self.contains + } + thresholds_outputs["symbol_to_word_ratio"] = { + symbol: ThresholdsOutput( + value=symbol_to_word_ratio(span, symbol), + threshold=threshold.symbol_to_word_ratio.get(symbol, None), + ) + for symbol in self.symbols } - self.set_extensions() + chr_frac = top_ngram_chr_fraction( + span, + ngram_range=self.top_ngram_range, + min_count=self.top_ngram_min_count, + ) - def quality_getter(self, span: Span) -> Dict[str, Union[float, int, bool]]: - """Apply quality functions to doc. + thresholds_outputs["top_ngram_chr_fraction"] = { + str(n_gram): ThresholdsOutput( + value=frac, + threshold=threshold.top_ngram_chr_fraction.get( + str(n_gram), + (None, None), + ), + ) + for n_gram, frac in chr_frac.items() + } + + duplicate_ngram_chr_fraction = duplicate_ngram_fraction( + span, + ngram_range=self.duplicate_n_gram_fraction_range, + ) + thresholds_outputs["duplicate_ngram_chr_fraction"] = { + str(n_gram): ThresholdsOutput( + value=frac, + threshold=threshold.duplicate_ngram_chr_fraction.get( + str(n_gram), + (None, None), + ), + ) + for n_gram, frac in duplicate_ngram_chr_fraction.items() + } + + return QualityOutput(**thresholds_outputs) + + def quality_getter(self, span: Union[Span, Doc]) -> QualityOutput: + """Get quality metrics from doc. 
Args: - span (Span): spaCy span object + span (Union[Span, Doc]): spaCy span or doc object Returns: - Dict[str, Union[float, int, bool]]: dictionary of quality metrics + QualityOutput: The quality metrics """ - quality = {} - for name, getter in self.getters.items(): - if name == "top_ngram_chr_fraction": - chr_frac = getter(span) # type: ignore - for n_gram, frac in chr_frac.items(): - quality[f"top_{n_gram}-gram_chr_fraction"] = frac - elif name == "duplicate_ngram_chr_fraction": - chr_frac = getter(span) # type: ignore - for n_gram, frac in chr_frac.items(): - quality[f"duplicate_{n_gram}-gram_chr_fraction"] = frac - else: - quality[name] = getter(span) # type: ignore - return quality + if not hasattr(span._, "_quality"): + return self.quality_setter(span) + return QualityOutput(**span._._quality) - @staticmethod - def is_within_range(rangetuple: Interval, value: float) -> bool: - """Check if a value is within a range tuple. If one of the values in - the range tuple is None it is considered to be unbounded. + def set_quality(self, doc: Doc) -> None: + """Set the quality attribute on a doc. Args: - rangetuple (Interval): range tuple - value (float): value to check + doc (Doc): spaCy doc object + """ + # to allow the variable to json serializable we convert it to json + # it is then converted back into a quality output object in the getter + + doc._._quality = self.quality_setter(doc).dict() + doc._.passed_quality_check = self.passed_quality_thresholds(doc) + + def passed_quality_thresholds(self, span: Union[Span, Doc]) -> bool: + """Check if a span passes the quality thresholds. 
+ + Args: + span (Union[Span, Doc]): spaCy span or doc object Returns: - bool: True if value is within range + bool: True if span passes quality thresholds """ - return (rangetuple[0] is None or rangetuple[0] <= value) and ( - rangetuple[1] is None or value <= rangetuple[1] - ) - - def passed_quality_thresholds(self, span: Span) -> bool: - """Checks whether a span passed the quality thresholds.""" - quality = span._.quality - qt = self.quality_thresholds - - # heuristic quality filters - if not self.is_within_range(qt.n_stop_words, quality["n_stop_words"]): - return False - if not self.is_within_range(qt.alpha_ratio, quality["alpha_ratio"]): - return False - if not self.is_within_range(qt.mean_word_length, quality["mean_word_length"]): - return False - if not self.is_within_range(qt.doc_length, quality["doc_length"]): - return False - if not self.is_within_range( - qt.proportion_ellipsis, - quality["proportion_ellipsis"], - ): - return False - if not self.is_within_range( - qt.proportion_bullet_points, - quality["proportion_bullet_points"], - ): - return False - - for symbol in self.symbols: - if symbol in qt.symbol_to_word_ratio: - if not self.is_within_range( - qt.symbol_to_word_ratio[symbol], - quality[f"symbol_{symbol}_to_word_ratio"], - ): - return False - - for string in self.contains: - if string in qt.contains and ( - qt.contains[string] is not quality[f"contains_{string}"] - ): - return False - - # text repetition - if not self.is_within_range( - qt.duplicate_line_chr_fraction, - quality["duplicate_line_chr_fraction"], - ): - return False - if not self.is_within_range( - qt.duplicate_paragraph_chr_fraction, - quality["duplicate_paragraph_chr_fraction"], - ): - return False - - for ngram in qt.duplicate_ngram_chr_fraction: - key = f"duplicate_{ngram}-gram_chr_fraction" - if key in quality: - if not self.is_within_range( - qt.duplicate_ngram_chr_fraction[ngram], - quality[key], - ): - return False - - for n_gram in qt.top_ngram_chr_fraction: - if n_gram in 
quality: - if not self.is_within_range( - qt.top_ngram_chr_fraction[n_gram], - quality[n_gram], - ): - return False - - return True + quality_output = self.quality_getter(span) + return quality_output.passed def set_extensions(self): """Set required extensions.""" - for ext_name, span_getter in self.extensions.items(): - # doc_getter = span_getter_to_doc_getter(span_getter) + ext_name = "passed_quality_check" + if not Span.has_extension(ext_name) or self.force is True: + Span.set_extension( + ext_name, + getter=self.passed_quality_thresholds, + force=True, + ) + if not Doc.has_extension(ext_name) or self.force is True: + Doc.set_extension( + ext_name, + getter=self.passed_quality_thresholds, + force=True, + ) - if not Span.has_extension(ext_name) or self.force is True: - Span.set_extension(ext_name, getter=span_getter, force=True) - if not Doc.has_extension(ext_name) or self.force is True: - Doc.set_extension(ext_name, getter=span_getter, force=True) + ext_name = "quality" + if not Doc.has_extension(ext_name) or self.force is True: + Doc.set_extension(ext_name, getter=self.quality_getter, force=True) + Doc.set_extension("_" + ext_name, default=None, force=True) + if not Span.has_extension(ext_name) or self.force is True: + Span.set_extension(ext_name, getter=self.quality_getter, force=True) + Span.set_extension("_" + ext_name, default=None, force=True) + + def set_quality_thresholds(self, thresholds: QualityThresholds) -> None: + """Sets the quality thresholds. + + Args: + thresholds (QualityThresholds): The desired quality thresholds. 
+ """ + self.quality_thresholds = thresholds def __call__(self, doc: Doc): """Run the pipeline component.""" + self.set_quality(doc) return doc @@ -639,7 +546,6 @@ def __call__(self, doc: Doc): "top_ngram_min_count": 3, "duplicate_n_gram_fraction_range": [5, 10], "force": True, - "quality_thresholds": None, }, ) def create_quality_component( @@ -650,7 +556,6 @@ def create_quality_component( top_ngram_range: Tuple[int, int], top_ngram_min_count: int, duplicate_n_gram_fraction_range: Tuple[int, int], - quality_thresholds: Optional[dict] = None, force: bool = True, ) -> Callable[[Doc], Doc]: """Allows Quality to be added to a spaCy pipe using @@ -695,12 +600,6 @@ def create_quality_component( be considered a top n-gram. Defaults to 3. duplicate_n_gram_fraction_range (Tuple[int]): range of n-grams to calculate the proportion of duplicate n-grams. Defaults to [5, 10]. - quality_thresholds (Optional[dict]): A dictionary object containing the - thresholds indicated by either an interval (Tuple) or a boolean. We - recommend using the QualityThresholds class to create this dictionary by - calling QualityThresholds(...).dict(). This ensures that all the thresholds - are validated. Defaults to None in which case the default for - QualityThresholds is used. force (bool): whether to overwrite existing extensions. Defaults to True. 
@@ -718,13 +617,6 @@ def create_quality_component( >>> # check whether the document passed the quality thresholds >>> doc._.passed_quality_check """ - # recons quality_thresholds since it needs to be json serializable for the config - # in the nlp.add_pipe call - if quality_thresholds is not None: - quality_thresholds_ = QualityThresholds(**quality_thresholds) - else: - quality_thresholds_ = None - return Quality( nlp, name=name, @@ -733,6 +625,6 @@ def create_quality_component( top_ngram_range=top_ngram_range, top_ngram_min_count=top_ngram_min_count, duplicate_n_gram_fraction_range=duplicate_n_gram_fraction_range, - quality_thresholds=quality_thresholds_, + quality_thresholds=None, force=force, ) diff --git a/src/textdescriptives/components/quality_data_classes.py b/src/textdescriptives/components/quality_data_classes.py new file mode 100644 index 00000000..840d203f --- /dev/null +++ b/src/textdescriptives/components/quality_data_classes.py @@ -0,0 +1,259 @@ +"""Data classes used for the quality component.""" +from typing import Any, Dict, Optional, Tuple, Union + +from pydantic import BaseModel, Extra, Field + +Interval = Tuple[Optional[float], Optional[float]] + + +class ThresholdsOutput(BaseModel): + """An output which contains an three items. 1) a thresholds which is either + an interval or a accepted boolean value. 2) a value which is the value of + the metric. 3) a boolean which is True if the value is within the + thresholds. 
+ + Example: + >>> t_out = ThresholdsOutput(threshold=(0, 2), value=2) + >>> t_out + ThresholdsOutput(value=2.0, passed=True, threshold=(0.0, 2.0)) + >>> t_out.passed + True + """ + + class Config: + extra = Extra.forbid + + threshold: Union[Interval, bool, None] + value: float + + @property + def passed(self) -> bool: + """Return True if the value is within the thresholds.""" + if self.threshold is None: + return True + if isinstance(self.threshold, bool): + return self.threshold == self.value + lower, upper = self.threshold + return (lower is None or lower <= self.value) and ( + upper is None or self.value <= upper + ) + + def __repr_str__(self, join_str: str) -> str: + value = round(self.value, 2) if isinstance(self.value, float) else self.value + return join_str.join( + repr(v) if a is None else f"{a}={v!r}" + for a, v in [ + ("value", value), + ("passed", self.passed), + ("threshold", self.threshold), + ] + ) + + def __eq__(self, other: Any) -> bool: + if isinstance(other, ThresholdsOutput): + return self.value == other.value and self.threshold == other.threshold + return self.value == other + + +class QualityThresholds(BaseModel): + """Thresholds for quality metrics.""" + + class Config: + extra = Extra.forbid + + n_stop_words: Interval = Field( + (2, None), + description="A Range for the number of stop words. Default: (2, None), i.e. " + + "at least 2 stop words, but no upper limit.", + ) + alpha_ratio: Interval = Field( + (0.7, None), + description="A Range for the alpha ratio. Default: (0.7, None), i.e. at " + + r"least 70% of tokens contain at least one alphabetic character, but no " + + "upper limit. Note this is lowered from the original 0.8 to account for a" + + "different definition of word boundaries. E.g. in spaCy a punctuation is" + + "not a part of a word.", + ) + mean_word_length: Interval = Field( + (3, 10), + description="A Range for the mean word length. Default: (3, 10), i.e. 
between" + + " 3 and 10 characters.", + ) + doc_length: Interval = Field( + (10, 100_000), + description="A Range for the document length. Default: (10, 100_000), i.e." + + " between 10 and 100_000 characters.", + ) + symbol_to_word_ratio: Dict[str, Interval] = Field( + {"#": (None, 0.1)}, + description="A dict of symbols and the allowed range for the " + + r"symbol-to-word-ratio. The symbol-to-word-ratio is the ratio between symbol" + + "occurrence and word occurrence. Defaults to {'#': (None, 0.1)} i.e. no lower" + + r" limit, but there must at most be a ratio of 0.1 between the number of of " + + "words and hashtags. i.e. if we have 100 words the symbol should appear no " + + "more than 10 times. Values not in the dict are not checked.", + ) + proportion_ellipsis: Interval = Field( + (None, 0.3), + description="A Range for the proportion of lines which end with ellipsis. " + + "Default: (None, 0.3), " + + r"i.e. no lower limit, but at most 30% of lines end with an ellipsis.", + ) + proportion_bullet_points: Interval = Field( + (None, 0.8), + description="A Range for the proportion lines which start with a bullet " + + r"points. Default: (None, 0.8), i.e. no lower limit, but at most 80% of lines" + + " start with a bullet point.", + ) + contains: Dict[str, bool] = Field( + {"lorem ipsum": False}, + description="A dictionary of strings and whether they should be contained in " + + "the document. Default: {'lorem ipsum': False}, i.e. the document should not" + + " contain the string 'lorem ipsum'.", + ) + duplicate_line_chr_fraction: Interval = Field( + (None, 0.2), + description="A Range for the duplicate line character fraction. Default: " + + r"(None, 0.2), i.e. no lower limit, but at most 20% of characters are" + + " duplicates.", + ) + duplicate_paragraph_chr_fraction: Interval = Field( + (None, 0.2), + description="A Range for the duplicate paragraph character fraction. Default:" + + r" (None, 0.2), i.e. 
no lower limit, but at most 20% of characters are " + + "duplicates.", + ) + duplicate_ngram_chr_fraction: Dict[str, Interval] = Field( + { + "5": (None, 0.15), + "6": (None, 0.14), + "7": (None, 0.13), + "8": (None, 0.12), + "9": (None, 0.11), + "10": (None, 0.1), + }, + description="A dictionary of n-gram lengths and the allowed range for the " + + "duplicate n-gram character fraction. Default: {5: (None, 0.15), 6: (None, " + + "0.14), 7: (None, 0.13), 8: (None, 0.12), 9: (None, 0.11), 10: (None, 0.1)}, " + + r"i.e. no lower limit, but at most 15% of characters are duplicates for " + + r"5-grams, 14% for 6-grams, 13% for 7-grams, 12% for 8-grams, 11% for 9-grams" + + r" and 10% for 10-grams.", + ) + top_ngram_chr_fraction: Dict[str, Interval] = Field( + { + "2": (None, 0.2), + "3": (None, 0.18), + "4": (None, 0.16), + }, + description="A dictionary of n-gram lengths and the allowed range for the " + + "top n-gram character fraction. Default: {2: (None, 0.2), 3: (None, 0.18)" + + r", 4: (None, 0.16)}, i.e. 
no lower limit, but at most 20% of characters " + + r"are contained within a duplicate for 2-grams, 18% for 3-grams and 16% " + + "for 4-grams.", + ) + + +class QualityOutput(BaseModel): + """The output of the quality function.""" + + class Config: + extra = Extra.forbid + + n_stop_words: ThresholdsOutput = Field( + ..., + description="The thresholds output for the number of stop words.", + ) + alpha_ratio: ThresholdsOutput = Field( + ..., + description="The thresholds output for the alpha ratio.", + ) + mean_word_length: ThresholdsOutput = Field( + ..., + description="The thresholds output for the mean word length.", + ) + doc_length: ThresholdsOutput = Field( + ..., + description="The thresholds output for the document length.", + ) + symbol_to_word_ratio: Dict[str, ThresholdsOutput] = Field( + ..., + description="The thresholds output for the symbol-to-word-ratio.", + ) + proportion_ellipsis: ThresholdsOutput = Field( + ..., + description="The thresholds output for the proportion of lines ending with " + + "ellipsis.", + ) + proportion_bullet_points: ThresholdsOutput = Field( + ..., + description="The thresholds output for the proportion of lines starting with " + + "bullet points.", + ) + contains: Dict[str, ThresholdsOutput] = Field( + ..., + description="The thresholds output for the presence of strings.", + ) + duplicate_line_chr_fraction: ThresholdsOutput = Field( + ..., + description="The thresholds output for the duplicate line character fraction.", + ) + duplicate_paragraph_chr_fraction: ThresholdsOutput = Field( + ..., + description="The thresholds output for the duplicate paragraph character " + + "fraction.", + ) + duplicate_ngram_chr_fraction: Dict[str, ThresholdsOutput] = Field( + ..., + description="The thresholds output for the duplicate n-gram character " + + "fraction.", + ) + top_ngram_chr_fraction: Dict[str, ThresholdsOutput] = Field( + ..., + description="The thresholds output for the top n-gram character fraction.", + ) + + @property + def 
passed(self) -> bool: + """ + Returns: + bool: Whether all thresholds have been passed. + """ + return all( + [ + self.n_stop_words.passed, + self.alpha_ratio.passed, + self.mean_word_length.passed, + self.doc_length.passed, + all(v.passed for v in self.symbol_to_word_ratio.values()), + self.proportion_ellipsis.passed, + self.proportion_bullet_points.passed, + all(v.passed for v in self.contains.values()), + self.duplicate_line_chr_fraction.passed, + self.duplicate_paragraph_chr_fraction.passed, + all(v.passed for v in self.duplicate_ngram_chr_fraction.values()), + all(v.passed for v in self.top_ngram_chr_fraction.values()), + ], + ) + + def __repr_str__(self, join_str: str) -> str: + return join_str.join( + repr(v) if a is None else f"\n\t{a}={v!r}" + for a, v in [ + ("passed", self.passed), + ] + + list(self.__repr_args__()) + ) + + def to_flat_value_dict(self) -> Dict[str, Any]: + """Creates a flat dictionary representation of the object to allow for + easy easy conversion to a pandas DataFrame.""" + flat_dict = {"passed_quality_check": self.passed} + + for k, v in self.__dict__.items(): + if isinstance(v, dict): + for k2, v2 in v.items(): + flat_dict[f"{k}_{k2}"] = v2.value + else: + flat_dict[k] = v.value + + return flat_dict diff --git a/src/textdescriptives/extractors.py b/src/textdescriptives/extractors.py index 1dea6ead..540cfb1d 100644 --- a/src/textdescriptives/extractors.py +++ b/src/textdescriptives/extractors.py @@ -14,7 +14,7 @@ def __get_quality(doc: Doc) -> dict: """Get quality metrics as well as boolean indicator for passing filters.""" - return {**doc._.quality, "passed_quality_check": doc._.passed_quality_check} + return doc._.quality.to_flat_value_dict() def __get_descriptive_stats_dict(doc: Doc) -> dict: diff --git a/tests/test_quality.py b/tests/test_quality.py index f042240b..4778a4e8 100644 --- a/tests/test_quality.py +++ b/tests/test_quality.py @@ -4,7 +4,6 @@ import pytest import spacy - import textdescriptives as td from 
textdescriptives.components.quality import ( alpha_ratio, @@ -181,15 +180,17 @@ def test_quality_component(nlp: spacy.Language): """Test the quality component.""" nlp.add_pipe("textdescriptives/quality", config={"force": True}) doc = nlp("This is a test. This is a test. This is a test.") - assert doc._.quality["n_stop_words"] == 9 - assert doc._.quality["mean_word_length"] == 2.4 - assert doc._.quality["alpha_ratio"] == 0.8 - assert doc._.quality["proportion_bullet_points"] == 0 - assert doc._.quality["proportion_ellipsis"] == 0 - assert doc._.quality["symbol_#_to_word_ratio"] == 0 - assert doc._.quality["duplicate_5-gram_chr_fraction"] == 1 - assert abs(doc._.quality["top_2-gram_chr_fraction"] - 0.44) < 0.01 + quality = doc._.quality + assert quality.n_stop_words == 9 + assert quality.mean_word_length == 2.4 + assert quality.alpha_ratio == 0.8 + assert quality.proportion_bullet_points == 0 + assert quality.proportion_ellipsis == 0 + assert quality.symbol_to_word_ratio["#"] == 0 + assert quality.duplicate_ngram_chr_fraction["5"] == 1 + assert abs(quality.top_ngram_chr_fraction["2"].value - 0.44) < 0.01 assert doc._.passed_quality_check is False + assert quality.passed is False def test_quality_component_with_config(nlp: spacy.Language): @@ -200,7 +201,7 @@ def test_quality_component_with_config(nlp: spacy.Language): alpha_ratio=(None, 0.8), mean_word_length=(1, 10), doc_length=(10, 100_000), - symbols_to_word_ratio={".": (None, 0.3)}, + symbol_to_word_ratio={".": (None, 0.3)}, proportion_ellipsis=(None, 0.3), proportion_bullet_points=(None, 0.8), duplicate_line_chr_fraction=(None, 0.2), @@ -210,25 +211,25 @@ def test_quality_component_with_config(nlp: spacy.Language): contains={"lorem ipsum": False}, ) - nlp.add_pipe( + quality_pipe = nlp.add_pipe( "textdescriptives/quality", config={ "symbols": ["."], - "quality_thresholds": quality_thresholds.dict(), "force": True, }, ) + quality_pipe.set_quality_thresholds(quality_thresholds) doc = nlp("This is a test. 
This is a test. This is a test.") - assert doc._.quality["n_stop_words"] == 9 - assert doc._.quality["mean_word_length"] == 2.4 - assert doc._.quality["alpha_ratio"] == 0.8 - assert doc._.quality["proportion_bullet_points"] == 0 - assert doc._.quality["proportion_ellipsis"] == 0 - assert doc._.quality["symbol_._to_word_ratio"] == 0.25 - assert doc._.quality["duplicate_5-gram_chr_fraction"] == 1 - assert doc._.quality["duplicate_8-gram_chr_fraction"] == 1 - assert abs(doc._.quality["top_3-gram_chr_fraction"] - 0.57) < 0.01 + assert doc._.quality.n_stop_words == 9 + assert doc._.quality.mean_word_length == 2.4 + assert doc._.quality.alpha_ratio == 0.8 + assert doc._.quality.proportion_bullet_points == 0 + assert doc._.quality.proportion_ellipsis == 0 + assert doc._.quality.symbol_to_word_ratio["."] == 0.25 + assert doc._.quality.duplicate_ngram_chr_fraction["5"] == 1 + assert doc._.quality.duplicate_ngram_chr_fraction["8"] == 1 + assert abs(doc._.quality.top_ngram_chr_fraction["3"].value - 0.57) < 0.01 assert doc._.passed_quality_check is True @@ -261,7 +262,7 @@ def test_quality_multi_process(nlp): "A couple of texts here, yeah yeah yeah.", "This is a second text, no repetition what so ever.", ] - + nlp.add_pipe("textdescriptives/quality", config={"force": True}) docs = nlp.pipe(texts, n_process=2) for doc in docs: assert doc._.quality