From 3574de581d78e37845f24bf8fde425d6cddfa644 Mon Sep 17 00:00:00 2001 From: Fabricio Arend Torres <9096900+FabricioArendTorres@users.noreply.github.com> Date: Thu, 18 Jul 2024 18:30:45 +0200 Subject: [PATCH 01/12] init --- .coveragerc | 4 + .gitignore | 3 + Makefile | 6 +- docs/index.md | 17 - examples/example.py | 138 ++++++++ gen_ref_pages.py | 26 ++ mkdocs.yml | 6 + requirements-test.txt | 5 + requirements.txt | 7 +- streamauc/__init__.py | 5 + streamauc/__main__.py | 6 - streamauc/base.py | 17 - streamauc/cli.py | 28 -- streamauc/documentation.md | 18 + streamauc/metrics/__init__.py | 33 ++ streamauc/metrics/_accuracy.py | 85 +++++ streamauc/metrics/_f1_score.py | 69 ++++ streamauc/metrics/_fnr.py | 74 ++++ streamauc/metrics/_fpr.py | 200 +++++++++++ streamauc/metrics/_jaccard_index.py | 70 ++++ streamauc/metrics/_precision.py | 195 +++++++++++ streamauc/metrics/_tnr.py | 75 ++++ streamauc/metrics/_tpr.py | 194 +++++++++++ streamauc/metrics/metric_synonyms.py | 132 +++++++ streamauc/plot_util.py | 131 +++++++ streamauc/streaming_metrics.py | 502 +++++++++++++++++++++++++++ streamauc/utils.py | 146 ++++++++ tests/metrics/__init__.py | 0 tests/metrics/test_accuracy.py | 179 ++++++++++ tests/metrics/test_f1_score.py | 171 +++++++++ tests/metrics/test_fnr.py | 70 ++++ tests/metrics/test_fpr.py | 216 ++++++++++++ tests/metrics/test_jaccard.py | 159 +++++++++ tests/metrics/test_precision.py | 226 ++++++++++++ tests/metrics/test_tnr.py | 71 ++++ tests/metrics/test_tpr.py | 211 +++++++++++ tests/test_base.py | 5 - tests/test_streaming_metrics.py | 419 ++++++++++++++++++++++ tests/utils/__init__.py | 0 tests/utils/test_plots.py | 117 +++++++ tests/utils/test_util.py | 71 ++++ 41 files changed, 4026 insertions(+), 81 deletions(-) create mode 100644 .coveragerc delete mode 100644 docs/index.md create mode 100644 examples/example.py create mode 100644 gen_ref_pages.py delete mode 100644 streamauc/__main__.py delete mode 100644 streamauc/base.py delete mode 100644 streamauc/cli.py create mode 100644 streamauc/documentation.md create mode 100644 streamauc/metrics/__init__.py create mode 100644 streamauc/metrics/_accuracy.py create mode 100644 streamauc/metrics/_f1_score.py create mode 100644 streamauc/metrics/_fnr.py create mode 100644 streamauc/metrics/_fpr.py create mode 100644 streamauc/metrics/_jaccard_index.py create mode 100644 streamauc/metrics/_precision.py create mode 100644 streamauc/metrics/_tnr.py create mode 100644 streamauc/metrics/_tpr.py create mode 100644 streamauc/metrics/metric_synonyms.py create mode 100644 streamauc/plot_util.py create mode 100644 streamauc/streaming_metrics.py create mode 100644 streamauc/utils.py create mode 100644 tests/metrics/__init__.py create mode 100644 tests/metrics/test_accuracy.py create mode 100644 tests/metrics/test_f1_score.py create mode 100644 tests/metrics/test_fnr.py create mode 100644 tests/metrics/test_fpr.py create mode 100644 tests/metrics/test_jaccard.py create mode 100644 tests/metrics/test_precision.py create mode 100644 tests/metrics/test_tnr.py create mode 100644 tests/metrics/test_tpr.py delete mode 100644 tests/test_base.py create mode 100644 tests/test_streaming_metrics.py create mode 100644 tests/utils/__init__.py create mode 100644 tests/utils/test_plots.py create mode 100644 tests/utils/test_util.py diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..b73102b --- /dev/null +++ b/.coveragerc @@ -0,0 +1,4 @@ +[run] +branch = True +omit = + streamauc/metrics/metric_synonyms.py \ No newline at end of file diff 
--git a/.gitignore b/.gitignore index 2d0fadb..eafd570 100644 --- a/.gitignore +++ b/.gitignore @@ -130,3 +130,6 @@ dmypy.json # templates .github/templates/* + + +.idea \ No newline at end of file diff --git a/Makefile b/Makefile index ef47815..7947896 100644 --- a/Makefile +++ b/Makefile @@ -32,10 +32,9 @@ fmt: ## Format code using black & isort. .PHONY: lint lint: ## Run pep8, black, mypy linters. - $(ENV_PREFIX)flake8 streamauc/ + $(ENV_PREFIX)flake8 --per-file-ignores="__init__.py:F401" streamauc/ $(ENV_PREFIX)black -l 79 --check streamauc/ $(ENV_PREFIX)black -l 79 --check tests/ - $(ENV_PREFIX)mypy --ignore-missing-imports streamauc/ .PHONY: test test: lint ## Run tests and generate coverage report. @@ -90,7 +89,8 @@ release: ## Create a new tag for release. .PHONY: docs docs: ## Build the documentation. @echo "building documentation ..." - @$(ENV_PREFIX)mkdocs build + @$(ENV_PREFIX)pdoc3 streamauc -o site/ --html --force -c latex_math=True + @mv site/streamauc/* site URL="site/index.html"; xdg-open $$URL || sensible-browser $$URL || x-www-browser $$URL || gnome-open $$URL || open $$URL .PHONY: switch-to-poetry diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index 000ea34..0000000 --- a/docs/index.md +++ /dev/null @@ -1,17 +0,0 @@ -# Welcome to MkDocs - -For full documentation visit [mkdocs.org](https://www.mkdocs.org). - -## Commands - -* `mkdocs new [dir-name]` - Create a new project. -* `mkdocs serve` - Start the live-reloading docs server. -* `mkdocs build` - Build the documentation site. -* `mkdocs -h` - Print help message and exit. - -## Project layout - - mkdocs.yml # The configuration file. - docs/ - index.md # The documentation homepage. - ... # Other markdown pages, images and other files. diff --git a/examples/example.py b/examples/example.py new file mode 100644 index 0000000..173f26b --- /dev/null +++ b/examples/example.py @@ -0,0 +1,138 @@ +""" +Example on how to use the library for tracking metrics. +For simplicity, sklearn is used as example model, with the iris dataset. + +A large dataset at test time is simulated by resampling... + +The fitting of the iris data is taken from the sklearn examples to +multiclass ROC, see: +https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html + + +For running this, you need to additionally install scikit-learn, tqdm, +and matplotlib (or just everything in the requirements-test.txt). 
+ +""" +import numpy as np + +from sklearn.datasets import load_iris +from sklearn.model_selection import train_test_split +from sklearn.linear_model import LogisticRegression +from sklearn.preprocessing import LabelBinarizer + +from tqdm import tqdm +import matplotlib.pyplot as plt + +from streamauc import StreamingMetrics, AggregationMethod, auc +from streamauc import metrics + +np.random.seed(1234) + +############################################################################## +# setup data +############################################################################## +iris = load_iris() +X, y = iris.data, iris.target_names[iris.target] + +random_state = np.random.RandomState(0) +n_samples, n_features = X.shape +X = np.concatenate( + [X, random_state.randn(n_samples, 200 * n_features)], axis=1 +) +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, stratify=y, random_state=0 +) +classifier = LogisticRegression(max_iter=1000) +y_score = classifier.fit(X_train, y_train).predict_proba(X_test) + +label_binarizer = LabelBinarizer().fit(y_train) +y_test = np.argmax(label_binarizer.transform(y_test), -1) + +### make test dataset absurdely large +idx = np.arange(y_test.shape[0]) +idx_resampled = np.random.choice(idx, 100_000) +y_test = y_test[idx_resampled] +X_test = X_test[idx_resampled] +X_test += np.random.randn(*X_test.shape) * 0.1 + +############################################################################## +# simulate minibatches at test time +############################################################################## + +# Select the number of thresholds for which we want to keep track of results. +stream_metrics = StreamingMetrics( + num_thresholds=100, + num_classes=3, +) + +mb_size = 10_000 +num_mbs = X_test.shape[0] // mb_size +for i in tqdm(range(num_mbs)): + mb_X = X_test[mb_size * i:mb_size * (i + 1)] + mb_y = y_test[mb_size * i:mb_size * (i + 1)] + + y_pred = classifier.predict_proba(mb_X) + stream_metrics.update(y_true=mb_y, y_score=y_pred) + + +# ###### METRICS +# get AUC for any combination of metrics with +# micro, macro or no aggregation (i.e. onevsall). 
+_auc_onevsall = stream_metrics.auc(metric_xaxis=metrics.recall, + metric_yaxis=metrics.precision, + method=AggregationMethod.ONE_VS_ALL) +_auc_micro = stream_metrics.auc(metric_xaxis=metrics.recall, + metric_yaxis=metrics.precision, + method=AggregationMethod.MICRO) +_auc_macro = stream_metrics.auc(metric_xaxis=metrics.recall, + metric_yaxis=metrics.precision, + method=AggregationMethod.MACRO) + +print("One VS All AUC of Precision Recall:") +print(_auc_onevsall) +print("Micro Averaged AUC of Precision Recall:") +print(_auc_micro) +print("Macro Averaged AUC of Precision Recall:") +print(_auc_macro) + + +# get metrics such as F1 at all thresholds +f1_scores = stream_metrics.calc_metric(metric=metrics.f1_score) +plt.plot(stream_metrics.thresholds, f1_scores) +plt.xlabel("Threshold") +plt.ylabel("F1 Score") +plt.show() + + +# ##### PERFORMANCE CURVES +# # plot one vs all precision recall curve for all classes +fig = stream_metrics.plot_precision_recall_curve(class_names=iris.target_names, + method=AggregationMethod.ONE_VS_ALL) +fig.suptitle("ONE_VS_ALL PR") +plt.show() +# plot one vs all precision recall curve for all classes +fig = stream_metrics.plot_roc_curve(class_names=iris.target_names, + method=AggregationMethod.ONE_VS_ALL) +fig.suptitle("ONE_VS_ALL ROC") +plt.show() + +# plot one vs all precision recall curve for specific class +fig = stream_metrics.plot_precision_recall_curve(class_names=iris.target_names, + method=AggregationMethod.ONE_VS_ALL, + class_index=0) +fig.suptitle("ONE_VS_ALL but only for CLASS 0") +plt.show() + + +# plot micro averaged precision recall +fig = stream_metrics.plot_precision_recall_curve(class_names=iris.target_names, + method=AggregationMethod.MICRO) +fig.suptitle("MICRO AVERAGED PR CURVE") +plt.show() + +# plot Macro averaged precision recall +fig = stream_metrics.plot_precision_recall_curve(class_names=iris.target_names, + method=AggregationMethod.MACRO) +fig.suptitle("MACRO AVERAGED PR CURVE") +plt.show() + diff --git a/gen_ref_pages.py b/gen_ref_pages.py new file mode 100644 index 0000000..edb65db --- /dev/null +++ b/gen_ref_pages.py @@ -0,0 +1,26 @@ +"""Generate the code reference pages.""" + +from pathlib import Path + +import mkdocs_gen_files + +root = Path(__file__).parent.parent +src = root / "streamauc" + +for path in sorted(src.rglob("*.py")): + module_path = path.relative_to(src).with_suffix("") + doc_path = path.relative_to(src).with_suffix(".md") + full_doc_path = Path("reference", doc_path) + + parts = tuple(module_path.parts) + + if parts[-1] == "__init__": + parts = parts[:-1] + elif parts[-1] == "__main__": + continue + + with mkdocs_gen_files.open(full_doc_path, "w") as fd: + identifier = ".".join(parts) + print("::: " + identifier, file=fd) + + mkdocs_gen_files.set_edit_path(full_doc_path, path.relative_to(root)) \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 563fef2..f4ec486 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,2 +1,8 @@ site_name: streamauc theme: readthedocs + +plugins: +- search +- gen-files: + scripts: + - gen_ref_pages.py \ No newline at end of file diff --git a/requirements-test.txt b/requirements-test.txt index 11ba2ba..e83d40c 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -8,3 +8,8 @@ pytest-cov mypy gitchangelog mkdocs +mkdocstrings[python] +mkdocs-gen-files +scikit-learn +ipdb +tqdm \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index b05f2a6..806f221 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,2 @@ -# This template is a 
low-dependency template. -# By default there is no requirements added here. -# Add the requirements you need to this file. -# or run `make init` to create this file automatically based on the template. -# You can also run `make switch-to-poetry` to use the poetry package manager. +numpy +matplotlib \ No newline at end of file diff --git a/streamauc/__init__.py b/streamauc/__init__.py index e69de29..48630bb 100644 --- a/streamauc/__init__.py +++ b/streamauc/__init__.py @@ -0,0 +1,5 @@ +# from .metrics import * +from .streaming_metrics import StreamingMetrics +from .utils import AggregationMethod, auc + +__all__ = ["StreamingMetrics", "AggregationMethod", "auc"] diff --git a/streamauc/__main__.py b/streamauc/__main__.py deleted file mode 100644 index 3e77994..0000000 --- a/streamauc/__main__.py +++ /dev/null @@ -1,6 +0,0 @@ -"""Entry point for streamauc.""" - -from streamauc.cli import main # pragma: no cover - -if __name__ == "__main__": # pragma: no cover - main() diff --git a/streamauc/base.py b/streamauc/base.py deleted file mode 100644 index c8c3581..0000000 --- a/streamauc/base.py +++ /dev/null @@ -1,17 +0,0 @@ -""" -streamauc base module. - -This is the principal module of the streamauc project. -here you put your main classes and objects. - -Be creative! do whatever you want! - -If you want to replace this with a Flask application run: - - $ make init - -and then choose `flask` as template. -""" - -# example constant variable -NAME = "streamauc" diff --git a/streamauc/cli.py b/streamauc/cli.py deleted file mode 100644 index 969287a..0000000 --- a/streamauc/cli.py +++ /dev/null @@ -1,28 +0,0 @@ -"""CLI interface for streamauc project. - -Be creative! do whatever you want! - -- Install click or typer and create a CLI app -- Use builtin argparse -- Start a web application -- Import things from your .base module -""" - - -def main(): # pragma: no cover - """ - The main function executes on commands: - `python -m streamauc` and `$ streamauc `. - - This is your program's entry point. - - You can change this function to do whatever you want. - Examples: - * Run a test suite - * Run a server - * Do some other stuff - * Run a command line application (Click, Typer, ArgParse) - * List all available tasks - * Run an application (Flask, FastAPI, Django, etc.) - """ - print("This will do something") diff --git a/streamauc/documentation.md b/streamauc/documentation.md new file mode 100644 index 0000000..ac9c2d9 --- /dev/null +++ b/streamauc/documentation.md @@ -0,0 +1,18 @@ + +Install +---- + +Examples +---------------------------- +You can find some basic examples for the usage of this library in `examples/toy_2d.py` and `examples/conditional_toy_2d.py`. + + + +License +------- + +This package is licensed under the [Apache License Version 2.0] +(https://www.apache.org/licenses/LICENSE-2.0.html). + +Copyright (c) 2023 Fabricio Arend Torres +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
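The Install section of the new `documentation.md` above is still empty; below is a minimal quickstart sketch of the API added in this patch. The install command and the synthetic data are assumptions for illustration, not part of the patch.

```python
# Assumed install from a local checkout (packaging is not part of this patch):
#   pip install -e .
import numpy as np

from streamauc import StreamingMetrics, AggregationMethod
from streamauc import metrics

# Track 100 thresholds for a 3-class problem.
stream_metrics = StreamingMetrics(num_thresholds=100, num_classes=3)

# Feed minibatches of labels and predicted class probabilities as they arrive;
# only the per-threshold confusion matrices are kept in memory.
rng = np.random.default_rng(0)
for _ in range(5):
    y_true = rng.integers(0, 3, size=64)          # class indices in {0, 1, 2}
    y_score = rng.dirichlet(np.ones(3), size=64)  # rows sum to one
    stream_metrics.update(y_true=y_true, y_score=y_score)

# Macro-averaged ROC AUC from the accumulated counts.
roc_auc = stream_metrics.auc(
    metric_xaxis=metrics.fpr,
    metric_yaxis=metrics.tpr,
    method=AggregationMethod.MACRO,
)
print(roc_auc)
```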
diff --git a/streamauc/metrics/__init__.py b/streamauc/metrics/__init__.py new file mode 100644 index 0000000..722c1c0 --- /dev/null +++ b/streamauc/metrics/__init__.py @@ -0,0 +1,33 @@ +from streamauc.metrics._f1_score import f1_score +from streamauc.metrics._fpr import fpr +from streamauc.metrics._jaccard_index import jaccard_index +from streamauc.metrics._precision import precision +from streamauc.metrics._tnr import tnr +from streamauc.metrics._tpr import tpr +from streamauc.metrics.metric_synonyms import ( + dice, + fallout, + hit_rate, + positive_predictive_value, + recall, + selectivity, + sensitivity, + specificity, +) + +__all__ = [ + "f1_score", + "fpr", + "jaccard_index", + "precision", + "tnr", + "tpr", + "dice", + "fallout", + "hit_rate", + "positive_predictive_value", + "recall", + "selectivity", + "sensitivity", + "specificity", +] diff --git a/streamauc/metrics/_accuracy.py b/streamauc/metrics/_accuracy.py new file mode 100644 index 0000000..f53b03a --- /dev/null +++ b/streamauc/metrics/_accuracy.py @@ -0,0 +1,85 @@ +from typing import Optional + +import numpy as np + +from streamauc.utils import AggregationMethod, check_confusion_matrix_entries + +__all__ = ["accuracy"] + + +def accuracy( + tp: np.ndarray, + tn: np.ndarray, + fp: np.ndarray, + fn: np.ndarray, + method: AggregationMethod = AggregationMethod.MACRO, + class_index: Optional[int] = None, + check_inputs: bool = True, + **kwargs, +): + if method == AggregationMethod.MICRO: + _acc = accuracy_micro( + tp=tp, tn=tn, fn=fn, fp=fp, check_inputs=check_inputs + ) + elif method == AggregationMethod.MACRO: + _acc = accuracy_macro( + tp=tp, fn=fn, fp=fp, tn=tn, check_inputs=check_inputs + ) + elif method == AggregationMethod.ONE_VS_ALL: + _acc = accuracy_onevsall( + tp=tp, fn=fn, fp=fp, tn=tn, check_inputs=check_inputs + )[..., class_index] + else: + raise ValueError( + f"Method must one of {[e.value for e in AggregationMethod]}. " + f"Got {method}." 
+ ) + return _acc + + +def accuracy_onevsall( + tp: np.ndarray, + tn: np.ndarray, + fn: np.ndarray, + fp: np.ndarray, + check_inputs: bool = True, +): + if check_inputs: + check_confusion_matrix_entries(fn, tp, fp) + + _total = tp + tn + fp + fn + _accuracy = (tp + tn) / (_total + 1e-12) + return _accuracy + + +def accuracy_micro( + tp: np.ndarray, + tn: np.ndarray, + fn: np.ndarray, + fp: np.ndarray, + check_inputs: bool = True, +): + if check_inputs: + check_confusion_matrix_entries(fn, tp, fp) + + tp_sum = np.sum(tp, axis=-1) + tn_sum = np.sum(tn, axis=-1) + fn_sum = np.sum(fn, axis=-1) + fp_sum = np.sum(fp, axis=-1) + + return accuracy_onevsall( + tp=tp_sum, tn=tn_sum, fn=fn_sum, fp=fp_sum, check_inputs=check_inputs + ) + + +def accuracy_macro( + tp: np.ndarray, + tn: np.ndarray, + fn: np.ndarray, + fp: np.ndarray, + check_inputs: bool = True, +): + _f1_onevsall = accuracy_onevsall( + tp=tp, tn=tn, fn=fn, fp=fp, check_inputs=check_inputs + ) + return np.mean(_f1_onevsall, axis=-1) diff --git a/streamauc/metrics/_f1_score.py b/streamauc/metrics/_f1_score.py new file mode 100644 index 0000000..003aace --- /dev/null +++ b/streamauc/metrics/_f1_score.py @@ -0,0 +1,69 @@ +from typing import Optional + +import numpy as np + +from streamauc.utils import AggregationMethod, check_confusion_matrix_entries + +__all__ = ["f1_score"] + + +def f1_score( + tp: np.ndarray, + fp: np.ndarray, + fn: np.ndarray, + method: AggregationMethod = AggregationMethod.MACRO, + class_index: Optional[int] = None, + check_inputs: bool = True, + **kwargs, +): + if method == AggregationMethod.MICRO: + _f1 = f1_micro(tp=tp, fn=fn, fp=fp, check_inputs=check_inputs) + elif method == AggregationMethod.MACRO: + _f1 = f1_macro(tp=tp, fn=fn, fp=fp, check_inputs=check_inputs) + elif method == AggregationMethod.ONE_VS_ALL: + _f1 = f1_onevsall(tp=tp, fn=fn, fp=fp, check_inputs=check_inputs)[ + ..., class_index + ] + else: + raise ValueError( + f"Method must one of {[e.value for e in AggregationMethod]}. " + f"Got {method}." 
+ ) + return _f1 + + +def f1_onevsall( + tp: np.ndarray, fn: np.ndarray, fp: np.ndarray, check_inputs: bool = True +): + if check_inputs: + check_confusion_matrix_entries(fn, tp, fp) + + _f1 = (2 * tp) / (2 * tp + fp + fn + 1e-12) + # _f1 = np.divide( + # 2 * tp, + # (2 * tp + fp + fn), + # where=(2 * tp + fp + fn) != 0, + # ) + return _f1 + + +def f1_micro( + tp: np.ndarray, fn: np.ndarray, fp: np.ndarray, check_inputs: bool = True +): + if check_inputs: + check_confusion_matrix_entries(fn, tp, fp) + + tp_sum = np.sum(tp, axis=-1) + fn_sum = np.sum(fn, axis=-1) + fp_sum = np.sum(fp, axis=-1) + + return f1_onevsall( + tp=tp_sum, fn=fn_sum, fp=fp_sum, check_inputs=check_inputs + ) + + +def f1_macro( + tp: np.ndarray, fn: np.ndarray, fp: np.ndarray, check_inputs: bool = True +): + _f1_onevsall = f1_onevsall(tp=tp, fn=fn, fp=fp, check_inputs=check_inputs) + return np.mean(_f1_onevsall, axis=-1) diff --git a/streamauc/metrics/_fnr.py b/streamauc/metrics/_fnr.py new file mode 100644 index 0000000..2e7bf14 --- /dev/null +++ b/streamauc/metrics/_fnr.py @@ -0,0 +1,74 @@ +from typing import Optional + +import numpy as np + +from streamauc.metrics import tpr +from streamauc.utils import AggregationMethod + +__all__ = ["fnr"] + + +def fnr( + tp: np.ndarray, + fn: np.ndarray, + method: AggregationMethod = AggregationMethod.MACRO, + class_index: Optional[int] = None, + check_inputs: bool = True, + **kwargs +): + """ + Compute the false negative rate (FNR) given the true positive (tp) and + false negative (fn) predictions at various thresholds. + Can be used as a Callable for the auc method. + + Parameters + ---------- + tp : np.ndarray + Array of true positives for each class. + Of shape [num_thresholds, num_classes] + fn : np.ndarray + Array of false negatives for each class. + Of shape [num_thresholds, num_classes] + method : AggregationMethod, optional + Aggregation method to be used in multiclass setting. + Default is AggregationMethod.MACRO. + class_index : int, optional + Class index for "one_vs_all" calculation. Required if `method` + is "one_vs_all". + check_inputs : bool, optional + If True, perform input validation checks. Default is True. + **kwargs + Additional keyword arguments. + + Returns + ------- + fnr : np.ndarray + FNR for the specified class across different samples. + Of shape [num_thresholds] + + Raises + ------ + ValueError + If an invalid aggregation method is specified. + + Notes + ----- + - For micro-averaging: + $$ \\text{FNR}_{\\text{micro}} = 1 - \\frac{\\sum \\text{TP}}{ + \\sum (\\text{TP} + \\text{FN})} $$ + - For macro-averaging: + $$ \\text{FNR}_{\\text{macro}} = 1 - \\frac{1}{C} \\sum_{c=1}^{C} + \\frac{\\text{TP}_c}{\\text{TP}_c + \\text{FN}_c} $$ + - For one-vs-all: + $$ \\text{FNR}_{\\text{one\\_vs\\_all}} = 1 - + \\frac{\\text{TP}_{c}}{\\text{TP}_{c} + \\text{FN}_{c}} $$ + where $ c $ is the specified class index. 
+ """ + return 1 - tpr( + tp=tp, + fn=fn, + method=method, + class_index=class_index, + check_inputs=check_inputs, + **kwargs + ) diff --git a/streamauc/metrics/_fpr.py b/streamauc/metrics/_fpr.py new file mode 100644 index 0000000..09cf1a4 --- /dev/null +++ b/streamauc/metrics/_fpr.py @@ -0,0 +1,200 @@ +from typing import Optional + +import numpy as np + +from streamauc.utils import AggregationMethod, check_confusion_matrix_entries + +__all__ = ["fpr"] + + +def fpr( + fp: np.ndarray, + tn: np.ndarray, + method: AggregationMethod = AggregationMethod.MACRO, + class_index: Optional[int] = None, + check_inputs: bool = True, + **kwargs, +) -> np.ndarray: + """ + Compute the false positive rate (FPR) given the false positive (fp) and + true negative (tn) predictions at various thresholds. + Can be used as a Callable for the auc method. + + Parameters + ---------- + fp : np.ndarray + Array of false positives for each class. + Of shape [num_thresholds, num_classes] + tn : np.ndarray + Array of true negatives for each class. + Of shape [num_thresholds, num_classes] + method : AggregationMethod + Aggregation method to be used in multiclass setting. + Default is AggregationMethod.MACRO. + class_index : int, optional + Class index for "one_vs_all" calculation. Required if `method` + is "one_vs_all". + check_inputs : bool, optional + If True, perform input validation checks. Default is True. + **kwargs + Additional keyword arguments. + + Returns + ------- + fpr : np.ndarray + FPR for the specified class across different samples. + Of shape [num_thresholds] + + Raises + ------ + ValueError + If an invalid aggregation method is specified. + + Notes + ----- + - For micro-averaging: + $$ \\text{FPR}_{\\text{micro}} = + \\frac{\\sum \\text{FP}}{\\sum (\\text{FP} + \\text{TN})} $$ + - For macro-averaging: + $$ \\text{FPR}_{\\text{macro}} = + \\frac{1}{C} \\sum_{c=1}^{C} \\frac{\\text{FP}_c}{\\text{FP}_c + + \\text{TN}_c} $$ + - For one-vs-all: + $$ \\text{FPR}_{\\text{one\\_vs\\_all}} = + \\frac{\\text{FP}_{c}}{\\text{FP}_{c} + \\text{TN}_{c}} $$ + where $ c $ is the specified class index. + """ + if method == AggregationMethod.MICRO: + _fpr = fpr_micro(fp=fp, tn=tn, check_inputs=check_inputs) + elif method == AggregationMethod.MACRO: + _fpr = fpr_macro(fp=fp, tn=tn, check_inputs=check_inputs) + elif method == AggregationMethod.ONE_VS_ALL: + _fpr = fpr_onevsall(fp=fp, tn=tn, check_inputs=check_inputs)[ + ..., class_index + ] + else: + raise ValueError( + f"Method must one of {[e.value for e in AggregationMethod]}. " + f"Got {method}." + ) + return _fpr + + +def fpr_macro( + fp: np.ndarray, tn: np.ndarray, check_inputs: bool = True +) -> np.ndarray: + """ + Compute macro-averaged false-positive rate (FPR) for multi-class + classification. + + Parameters + ---------- + fp : np.ndarray + Array of false positives for each class. + Of shape [num_thresholds, num_classes] + tn : np.ndarray + Array of true negatives for each class. + Of shape [num_thresholds, num_classes] + check_inputs : bool, optional + If True, perform input validation checks. Default is True. + + Returns + ------- + fpr : np.ndarray + Macro-averaged false positive rate for each class. 
+ Of shape [num_thresholds] + + Notes + ----- + - FPR (False Positive Rate) is calculated as the mean of individual + class FPRs: + $$ \\text{FPR} = \\frac{1}{C} \\sum_{c=1}^{C} + \\frac{\\text{FP}_c}{\\text{FP}_c + \\text{TN}_c} $$ + """ + + if check_inputs: + check_confusion_matrix_entries(fp, tn) + + _fpr = fpr_onevsall(fp=fp, tn=tn, check_inputs=check_inputs).mean(axis=-1) + return _fpr + + +def fpr_micro( + fp: np.ndarray, tn: np.ndarray, check_inputs: bool = True +) -> np.ndarray: + """ + Compute micro-averaged false positive rate (FPR) + for multi-class classification. + + Parameters + ---------- + fp : np.ndarray + Array of false positives for each class. + Of shape [num_thresholds, num_classes] + tn : np.ndarray + Array of true negatives for each class. + Of shape [num_thresholds, num_classes] + check_inputs : bool, optional + If True, perform input validation checks. Default is True. + + Returns + ------- + fpr : np.ndarray + Micro-averaged false positive rate for each class. + Of shape [num_thresholds] + + Notes + ----- + - FPR (False Positive Rate) is calculated as the ratio of false positives + to the sum of false positives and true negatives: + $$ \\text{FPR} = \\frac{\\sum_i \\text{FP}_i}{\\sum_i (\\text{FP}_i + + \\text{TN}_i)} $$ + """ + if check_inputs: + check_confusion_matrix_entries(fp, tn) + + fp_sum = np.sum(fp, axis=-1) + tn_sum = np.sum(tn, axis=-1) + + _fpr = fp_sum / (fp_sum + tn_sum + 1e-12) + return _fpr + + +def fpr_onevsall( + fp: np.ndarray, + tn: np.ndarray, + check_inputs: bool = True, +) -> np.ndarray: + """ + Compute TPR and FPR for a one-vs-all multi-class classification setup. + + Parameters + ---------- + fp : np.ndarray + Array of false positives for each class. + Of shape [num_thresholds, num_classes] + tn : np.ndarray + Array of true negatives for each class. + Of shape [num_thresholds, num_classes] + check_inputs : bool, optional + If True, perform input validation checks. Default is True. + + Returns + ------- + fpr : np.ndarray + False positive rate for each class across different samples. 
+ Of shape [num_thresholds, num_classes] + + Notes + ----- + - FPR (False Positive Rate) is calculated as the ratio of false positives + to the + sum of false positives and true negatives for the specified class: + $$ \\text{FPR} = + \\frac{\\text{FP}_{c}}{\\text{FP}_{c} + \\text{TN}_{c}} $$ + """ + if check_inputs: + check_confusion_matrix_entries(fp, tn) + + _fpr = fp / (fp + tn + 1e-12) + return _fpr diff --git a/streamauc/metrics/_jaccard_index.py b/streamauc/metrics/_jaccard_index.py new file mode 100644 index 0000000..f811d45 --- /dev/null +++ b/streamauc/metrics/_jaccard_index.py @@ -0,0 +1,70 @@ +from typing import Optional + +import numpy as np + +from streamauc.utils import AggregationMethod, check_confusion_matrix_entries + +__all__ = ["jaccard_index"] + + +def jaccard_index( + tp: np.ndarray, + fp: np.ndarray, + fn: np.ndarray, + method: AggregationMethod = AggregationMethod.MACRO, + class_index: Optional[int] = None, + check_inputs: bool = True, + **kwargs, +): + if method == AggregationMethod.MICRO: + _jaccard_index = jaccard_index_micro( + tp=tp, fn=fn, fp=fp, check_inputs=check_inputs + ) + elif method == AggregationMethod.MACRO: + _jaccard_index = jaccard_index_macro( + tp=tp, fn=fn, fp=fp, check_inputs=check_inputs + ) + elif method == AggregationMethod.ONE_VS_ALL: + _jaccard_index = jaccard_index_onevsall( + tp=tp, fn=fn, fp=fp, check_inputs=check_inputs + )[..., class_index] + else: + raise ValueError( + f"Method must one of {[e.value for e in AggregationMethod]}. " + f"Got {method}." + ) + return _jaccard_index + + +def jaccard_index_onevsall( + tp: np.ndarray, fn: np.ndarray, fp: np.ndarray, check_inputs: bool = True +): + if check_inputs: + check_confusion_matrix_entries(fn, tp, fp) + + _jaccard = tp / (tp + fp + fn + 1e-12) + return _jaccard + + +def jaccard_index_micro( + tp: np.ndarray, fn: np.ndarray, fp: np.ndarray, check_inputs: bool = True +): + if check_inputs: + check_confusion_matrix_entries(fn, tp, fp) + + tp_sum = np.sum(tp, axis=-1) + fn_sum = np.sum(fn, axis=-1) + fp_sum = np.sum(fp, axis=-1) + + return jaccard_index_onevsall( + tp=tp_sum, fn=fn_sum, fp=fp_sum, check_inputs=check_inputs + ) + + +def jaccard_index_macro( + tp: np.ndarray, fn: np.ndarray, fp: np.ndarray, check_inputs: bool = True +): + _f1_onevsall = jaccard_index_onevsall( + tp=tp, fn=fn, fp=fp, check_inputs=check_inputs + ) + return np.mean(_f1_onevsall, axis=-1) diff --git a/streamauc/metrics/_precision.py b/streamauc/metrics/_precision.py new file mode 100644 index 0000000..ab247a0 --- /dev/null +++ b/streamauc/metrics/_precision.py @@ -0,0 +1,195 @@ +from typing import Optional + +import numpy as np + +from streamauc.utils import AggregationMethod, check_confusion_matrix_entries + +__all__ = ["precision"] + + +def precision( + tp: np.ndarray, + fp: np.ndarray, + method: AggregationMethod = AggregationMethod.MACRO, + class_index: Optional[int] = None, + check_inputs: bool = True, + **kwargs, +): + """ + Compute precision for multi-class classification using the + specified aggregation method. + + Parameters + ---------- + tp : np.ndarray + Array of true positives for each class. + fp : np.ndarray + Array of false positives for each class. + method : AggregationMethod, optional + Method used to compute precision for multiple classes. + Default is AggregationMethod.MACRO. + Must be one of ["macro", "micro", "one_vs_all"]. + class_index : int, optional + Class index for "one_vs_all" calculation. Required if `method` + is "one_vs_all". 
+ check_inputs : bool, optional + If True, perform input validation checks. Default is True. + **kwargs + Additional keyword arguments. + + Returns + ------- + precision : np.ndarray + Computed precision values based on the specified aggregation method. + + Raises + ------ + ValueError + If an invalid aggregation method is specified. + + Notes + ----- + - For micro-averaging: + $$ \\text{Precision}_{\\text{micro}} = + \\frac{\\sum \\text{TP}}{\\sum (\\text{TP} + \\text{FP})} $$ + - For macro-averaging: + $$ \\text{Precision}_{\\text{macro}} = + \\frac{1}{C} \\sum_{c=1}^{C} + \\frac{\\text{TP}_c}{\\text{TP}_c + \\text{FP}_c} $$ + - For one-vs-all: + $$ \\text{Precision}_{\\text{one\\_vs\\_all}} = + \\frac{\\text{TP}_{c}}{\\text{TP}_{c} + \\text{FP}_{c}} $$ + where $ c $ is the specified class index. + """ + if method == AggregationMethod.MICRO: + precision = precision_micro(tp=tp, fp=fp, check_inputs=check_inputs) + elif method == AggregationMethod.MACRO: + precision = precision_macro(tp=tp, fp=fp, check_inputs=check_inputs) + elif method == AggregationMethod.ONE_VS_ALL: + precision = precision_onevsall( + tp=tp, fp=fp, check_inputs=check_inputs + )[..., class_index] + else: + raise ValueError( + f"Method must one of {[e.value for e in AggregationMethod]}. " + f"Got {method}." + ) + + return precision + + +def precision_onevsall( + tp: np.ndarray, fp: np.ndarray, check_inputs: bool = True +): + """ + Compute precision and recall for a one-vs-all multi-class classification + setup for all classes. + + Parameters + ---------- + tp : np.ndarray + Array of true positives for each class. + Of shape [num_thresholds, num_classes] + fp : np.ndarray + Array of false positives for each class. + Of shape [num_thresholds, num_classes] + class_index : int + Index of the class for which to compute precision and recall. + check_inputs : bool, optional + If True, perform input validation checks. Default is True. + + Returns + ------- + precision : np.ndarray + Precision for the specified class across different samples. + Of shape [num_thresholds, num_classes] + + Notes + ----- + - Precision is calculated as the ratio of true positives to the sum of true + positives and false positives for the specified class: + $$ \\text{Precision} = \\frac{\\text{TP}_{c}}{\\text{TP}_{c} + + \\text{FP}_{c}} $$ + """ + if check_inputs: + check_confusion_matrix_entries(fp, tp) + + precision = tp / (tp + fp + 1e-10) + return precision + + +def precision_micro( + tp: np.ndarray, fp: np.ndarray, check_inputs: bool = True +) -> np.ndarray: + """ + Compute micro-averaged precision for multi-class classification. + + Parameters + ---------- + tp : np.ndarray + Array of true positives for each threshold and class. + Of shape [num_thresholds, num_classes] + fp : np.ndarray + Array of false positives for each threshold and class. + Of shape [num_thresholds, num_classes] + check_inputs : bool + Whether to check inputs with asserts. + + Returns + ------- + precision : np.ndarray + Micro-averaged precision for each threshold. 
+ Of shape [num_thresholds] + + Notes + ----- + - Precision is calculated as the ratio of true positives to the sum of true + positives and false positives: + $$ \\text{Precision} = \\frac{\\sum_i \\text{TP}_i}{\\sum_i (\\text{ + TP}_i + \\text{FP}_i)} $$ + """ + + if check_inputs: + check_confusion_matrix_entries(fp, tp) + + tp_sum = np.sum(tp, axis=-1) + fp_sum = np.sum(fp, axis=-1) + return precision_onevsall(tp=tp_sum, fp=fp_sum, check_inputs=check_inputs) + + +def precision_macro( + tp: np.ndarray, fp: np.ndarray, check_inputs: bool = True +) -> np.ndarray: + """ + Compute macro-averaged precision and recall for multi-class classification. + + Parameters + ---------- + tp : np.ndarray + Array of true positives for each class. + Of shape [num_thresholds, num_classes] + fp : np.ndarray + Array of false positives for each class. + Of shape [num_thresholds, num_classes] + check_inputs : bool, optional + If True, perform input validation checks. Default is True. + + Returns + ------- + precision : np.ndarray + Macro-averaged precision for each class. + Of shape [num_thresholds] + + Notes + ----- + - Precision is calculated as the mean of individual class precisions: + $$ \\text{Precision} = \\frac{1}{C} \\sum_{c=1}^{C} + \\frac{\\text{TP}_c}{\\text{TP}_c + \\text{FP}_c} $$ + """ + if check_inputs: + check_confusion_matrix_entries(fp, tp) + + precision = precision_onevsall( + tp=tp, fp=fp, check_inputs=check_inputs + ).mean(axis=-1) + return precision diff --git a/streamauc/metrics/_tnr.py b/streamauc/metrics/_tnr.py new file mode 100644 index 0000000..31b23f5 --- /dev/null +++ b/streamauc/metrics/_tnr.py @@ -0,0 +1,75 @@ +from typing import Optional + +import numpy as np + +from streamauc.metrics._fpr import fpr +from streamauc.utils import AggregationMethod + +__all__ = ["tnr"] + + +def tnr( + fp: np.ndarray, + tn: np.ndarray, + method: AggregationMethod = AggregationMethod.MACRO, + class_index: Optional[int] = None, + check_inputs: bool = True, + **kwargs, +) -> np.ndarray: + """ + Compute the true negative rate (TNR) given the false positive (fp) and + true negative (tn) predictions at various thresholds. + Can be used as a Callable for the auc method. + + Parameters + ---------- + fp : np.ndarray + Array of false positives for each class. + Of shape [num_thresholds, num_classes] + tn : np.ndarray + Array of true negatives for each class. + Of shape [num_thresholds, num_classes] + method : AggregationMethod, optional + Aggregation method to be used in multiclass setting. + Default is AggregationMethod.MACRO. + class_index : int, optional + Class index for "one_vs_all" calculation. Required if `method` + is "one_vs_all". + check_inputs : bool, optional + If True, perform input validation checks. Default is True. + **kwargs + Additional keyword arguments. + + Returns + ------- + tnr : np.ndarray + TNR for the specified class across different samples. + Of shape [num_thresholds] + + Raises + ------ + ValueError + If an invalid aggregation method is specified. + + Notes + ----- + - For micro-averaging: + $$ \\text{TNR}_{\\text{micro}} = 1 - + \\frac{\\sum \\text{FP}}{\\sum (\\text{FP} + \\text{TN})} $$ + - For macro-averaging: + $$ \\text{TNR}_{\\text{macro}} = 1 - + \\frac{1}{C} \\sum_{c=1}^{C} \\frac{\\text{FP}_c}{\\text{FP}_c + + \\text{TN}_c} $$ + - For one-vs-all: + $$ \\text{TNR}_{\\text{one\\_vs\\_all}} = 1 - + \\frac{\\text{FP}_{c}}{\\text{FP}_{c} + \\text{TN}_{c}} $$ + where $ c $ is the specified class index. 
+ """ + return 1 - fpr( + fp=fp, + tn=tn, + method=method, + class_index=class_index, + check_inputs=check_inputs, + **kwargs, + ) diff --git a/streamauc/metrics/_tpr.py b/streamauc/metrics/_tpr.py new file mode 100644 index 0000000..addcb87 --- /dev/null +++ b/streamauc/metrics/_tpr.py @@ -0,0 +1,194 @@ +from typing import Optional, Tuple + +import numpy as np + +from streamauc.utils import AggregationMethod, check_confusion_matrix_entries + +__all__ = ["tpr"] + + +def tpr( + tp: np.ndarray, + fn: np.ndarray, + method: AggregationMethod = AggregationMethod.MACRO, + class_index: Optional[int] = None, + check_inputs: bool = True, + **kwargs, +): + """ + Compute the true positive rate (TPR) given the true positive (tp) and + false negative (fn) predictions at various thresholds. + Can be used as a Callable for the auc method. + + Parameters + ---------- + tp : np.ndarray + Array of true positives for each class. + Of shape [num_thresholds, num_classes] + fn : np.ndarray + Array of false negatives for each class. + Of shape [num_thresholds, num_classes] + method : AggregationMethod + Aggregation method to be used in multiclass setting. + Default is AggregationMethod.MACRO. + class_index : int, optional + Class index for "one_vs_all" calculation. Required if `method` + is "one_vs_all". + check_inputs : bool, optional + If True, perform input validation checks. Default is True. + **kwargs + Additional keyword arguments. + + Returns + ------- + tpr : np.ndarray + TPR for the specified class across different samples. + Of shape [num_thresholds] + + Raises + ------ + ValueError + If an invalid aggregation method is specified. + + Notes + ----- + - For micro-averaging: + $$ \\text{TPR}_{\\text{micro}} = \\frac{\\sum \\text{TP}}{ + \\sum (\\text{TP} + \\text{FN})} $$ + - For macro-averaging: + $$ \\text{TPR}_{\\text{macro}} = \\frac{1}{C} \\sum_{c=1}^{C} + \\frac{\\text{TP}_c}{\\text{TP}_c + \\text{FN}_c} $$ + - For one-vs-all: + $$ \\text{TPR}_{\\text{one\\_vs\\_all}} = \\frac{\\text{TP}_{c}}{ + \\text{TP}_{c} + \\text{FN}_{c}} $$ + where $ c $ is the specified class index. + """ + + if method == AggregationMethod.MICRO: + _tpr = tpr_micro(tp=tp, fn=fn, check_inputs=check_inputs) + elif method == AggregationMethod.MACRO: + _tpr = tpr_macro(tp=tp, fn=fn, check_inputs=check_inputs) + elif method == AggregationMethod.ONE_VS_ALL: + _tpr = tpr_onevsall(tp=tp, fn=fn, check_inputs=check_inputs)[ + ..., class_index + ] + else: + raise ValueError( + f"Method must one of {[e.value for e in AggregationMethod]}. " + f"Got {method}." + ) + return _tpr + + +def tpr_onevsall(tp: np.ndarray, fn: np.ndarray, check_inputs: bool = True): + """ + Compute the true positive rate (TPR) for a one-vs-all multi-class + classification setup for all classes. + + Parameters + ---------- + tp : np.ndarray + Array of true positives for each class. + Of shape [num_thresholds, num_classes] + fn : np.ndarray + Array of false negatives for each class. + Of shape [num_thresholds, num_classes] + check_inputs : bool, optional + If True, perform input validation checks. Default is True. + + Returns + ------- + tpr : np.ndarray + TPR for the specified class across different samples. 
+ Of shape [num_thresholds, num_classes] + + Notes + ----- + - TPR is calculated as the ratio of true positives to the sum of true + positives and false negatives for the specified class: + $$ \\text{TPR} = \\frac{\\text{TP}_{c}}{\\text{TP}_{c} + + \\text{FN}_{c}} $$ + """ + if check_inputs: + check_confusion_matrix_entries(fn, tp) + + _tpr = tp / (tp + fn + 1e-12) + return _tpr + + +def tpr_micro( + tp: np.ndarray, fn: np.ndarray, check_inputs: bool = True +) -> np.ndarray: + """ + Compute micro-averaged recall for multi-class classification. + + Parameters + ---------- + tp : np.ndarray + Array of true positives for each threshold and class. + Of shape [num_thresholds, num_classes] + fn : np.ndarray + Array of false negatives for each threshold and class. + Of shape [num_thresholds, num_classes] + check_inputs : bool + Whether to check inputs with asserts. + + Returns + ------- + recall : np.ndarray + Micro-averaged recall for each threshold. + Of shape [num_thresholds] + + Notes + ----- + - Recall is calculated as the ratio of true positives to the sum of true + positives and false negatives: + $$ \\text{Recall} = \\frac{\\sum_i \\text{TP}_i}{\\sum_i + (\\text{TP}_i + \\text{FN}_i)} $$ + """ + + if check_inputs: + check_confusion_matrix_entries(fn, tp) + + tp_sum = np.sum(tp, axis=-1) + fn_sum = np.sum(fn, axis=-1) + _tpr = tp_sum / (tp_sum + fn_sum + 1e-12) + return _tpr + + +def tpr_macro( + tp: np.ndarray, fn: np.ndarray, check_inputs: bool = True +) -> Tuple[np.ndarray, np.ndarray]: + """ + Compute macro-averaged TPR for multi-class classification. + + Parameters + ---------- + tp : np.ndarray + Array of true positives for each class. + Of shape [num_thresholds, num_classes] + fn : np.ndarray + Array of false negatives for each class. + Of shape [num_thresholds, num_classes] + check_inputs : bool, optional + If True, perform input validation checks. Default is True. + + Returns + ------- + tpr : np.ndarray + Macro-averaged true positive rate for each class. 
+ Of shape [num_thresholds] + + Notes + ----- + - TPR (True Positive Rate) is calculated as the mean of individual + class TPRs: + $$ \\text{TPR} = \\frac{1}{C} \\sum_{c=1}^{C} + \\frac{\\text{TP}_c}{\\text{TP}_c + \\text{FN}_c} $$ + """ + + if check_inputs: + check_confusion_matrix_entries(tp, fn) + + _tpr = (tp / (tp + fn + 1e-12)).mean(axis=-1) + return _tpr diff --git a/streamauc/metrics/metric_synonyms.py b/streamauc/metrics/metric_synonyms.py new file mode 100644 index 0000000..57021c8 --- /dev/null +++ b/streamauc/metrics/metric_synonyms.py @@ -0,0 +1,132 @@ +# pragma: no cover +from typing import Optional + +import numpy as np + +from streamauc.metrics._f1_score import f1_score +from streamauc.metrics._fpr import fpr +from streamauc.metrics._precision import precision +from streamauc.metrics._tnr import tnr +from streamauc.metrics._tpr import tpr +from streamauc.utils import AggregationMethod, copy_docstring_from + +__all__ = [ + "recall", + "sensitivity", + "specificity", + "selectivity", + "positive_predictive_value", + "dice", + "hit_rate", + "fallout", +] + + +# ###### TPR +@copy_docstring_from(tpr) +def recall( + tp: np.ndarray, + fn: np.ndarray, + method: AggregationMethod = AggregationMethod.MACRO, + class_index: Optional[int] = None, + check_inputs: bool = True, + **kwargs, +): + return tpr(tp, fn, method, class_index, check_inputs, **kwargs) + + +@copy_docstring_from(tpr) +def sensitivity( + tp: np.ndarray, + fn: np.ndarray, + method: AggregationMethod = AggregationMethod.MACRO, + class_index: Optional[int] = None, + check_inputs: bool = True, + **kwargs, +): + return tpr(tp, fn, method, class_index, check_inputs, **kwargs) + + +@copy_docstring_from(tpr) +def hit_rate( + tp: np.ndarray, + fn: np.ndarray, + method: AggregationMethod = AggregationMethod.MACRO, + class_index: Optional[int] = None, + check_inputs: bool = True, + **kwargs, +): + return tpr(tp, fn, method, class_index, check_inputs, **kwargs) + + +# ###### fpr +@copy_docstring_from(fpr) +def fallout( + fp: np.ndarray, + tn: np.ndarray, + method: AggregationMethod = AggregationMethod.MACRO, + class_index: Optional[int] = None, + check_inputs: bool = True, + **kwargs, +): + return fpr(fp, tn, method, class_index, check_inputs, **kwargs) + + +# ###### precision +@copy_docstring_from(precision) +def positive_predictive_value( + tp: np.ndarray, + fp: np.ndarray, + method: AggregationMethod = AggregationMethod.MACRO, + class_index: Optional[int] = None, + check_inputs: bool = True, + **kwargs, +): + precision(tp, fp, method, class_index, check_inputs, **kwargs) + + +# ###### tnr +@copy_docstring_from(tnr) +def specificity( + fp: np.ndarray, + tn: np.ndarray, + method: AggregationMethod = AggregationMethod.MACRO, + class_index: Optional[int] = None, + check_inputs: bool = True, + **kwargs, +): + return tnr(fp, tn, method, class_index, check_inputs, **kwargs) + + +@copy_docstring_from(tnr) +def selectivity( + fp: np.ndarray, + tn: np.ndarray, + method: AggregationMethod = AggregationMethod.MACRO, + class_index: Optional[int] = None, + check_inputs: bool = True, + **kwargs, +): + return tnr(fp, tn, method, class_index, check_inputs, **kwargs) + + +# ###### f1 +@copy_docstring_from(f1_score) +def dice( + tp: np.ndarray, + fp: np.ndarray, + fn: np.ndarray, + method: AggregationMethod = AggregationMethod.MACRO, + class_index: Optional[int] = None, + check_inputs: bool = True, + **kwargs, +): + return f1_score( + tp=tp, + fp=fp, + fn=fn, + method=method, + class_index=class_index, + check_inputs=check_inputs, + **kwargs, + ) 
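Note that `positive_predictive_value` above appears to be missing a `return` in front of its `precision(...)` call, so as written it returns `None`. These synonyms, like the metric functions they wrap, are plain functions of confusion-matrix counts of shape `[num_thresholds, num_classes]` and can be exercised directly without a `StreamingMetrics` instance; a small sketch with made-up counts:

```python
import numpy as np

from streamauc import AggregationMethod
from streamauc.metrics import f1_score, precision, recall

# Arbitrary counts for 2 thresholds and 3 classes,
# shape [num_thresholds, num_classes].
tp = np.array([[5, 3, 8], [4, 2, 6]])
fp = np.array([[1, 2, 0], [0, 1, 1]])
fn = np.array([[0, 2, 1], [1, 3, 3]])

# Macro: average the per-class metric; micro: pool the counts across classes.
prec_macro = precision(tp=tp, fp=fp, method=AggregationMethod.MACRO)
prec_micro = precision(tp=tp, fp=fp, method=AggregationMethod.MICRO)

# One-vs-all: the metric for a single class, selected via class_index.
rec_class0 = recall(
    tp=tp, fn=fn, method=AggregationMethod.ONE_VS_ALL, class_index=0
)

# f1_score combines tp, fp and fn; extra counts passed via **kwargs are ignored.
f1_macro = f1_score(tp=tp, fp=fp, fn=fn, method=AggregationMethod.MACRO)

# Each result has shape (2,): one value per threshold.
print(prec_macro, prec_micro, rec_class0, f1_macro)
```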
diff --git a/streamauc/plot_util.py b/streamauc/plot_util.py new file mode 100644 index 0000000..743f90f --- /dev/null +++ b/streamauc/plot_util.py @@ -0,0 +1,131 @@ +import math +from typing import List, Optional, Tuple + +import numpy as np +from matplotlib import pyplot as plt +from matplotlib.cm import ScalarMappable +from matplotlib.collections import LineCollection + +from streamauc.utils import auc + + +def create_square_subplots( + num_subplots, +) -> Tuple[plt.Figure, List[plt.Axes]]: # pragma: nocover + + if num_subplots <= 0: + raise ValueError("Must be greater 0...") + + # Calculate the number of rows and columns + num_cols = math.ceil(math.sqrt(num_subplots)) + num_rows = math.ceil(num_subplots / num_cols) + + # Create the subplots + fig, axs = plt.subplots( + num_rows, num_cols, figsize=(num_cols * 5, num_rows * 5) + ) + + # Flatten the axs array for easy iteration, + # even if there is only one row/column + if num_subplots == 1: + axs = [axs] + else: + axs = axs.flatten() + + # Hide any unused subplots + for i in range(num_subplots, num_rows * num_cols): + fig.delaxes(axs[i]) + + return fig, fig.axes + + +def plot_curve_and_auc( + x_values: np.ndarray, + y_values: np.ndarray, + thresholds: np.ndarray, + class_names: Optional[List[str]] = None, + cmap="viridis", +) -> plt.Figure: + if x_values.ndim == 1: + + fig, axs = create_square_subplots(1) + ax = axs[0] + + # Normalization for color mapping + norm = plt.Normalize(vmin=min(thresholds), vmax=max(thresholds)) + cmap = plt.get_cmap(cmap) + + _auc = auc(x_values, y_values) + points = np.array([x_values, y_values]).T.reshape(-1, 1, 2) + segments = np.concatenate([points[:-1], points[1:]], axis=1) + lc = LineCollection(segments, cmap=cmap, norm=norm) + lc.set_array(thresholds) + lc.set_linewidth(2) + ax.add_collection(lc) + ax.scatter(x_values, y_values, color="black", marker="+", alpha=0.1) + + if class_names is None: + ax.set_title(f"AUC={_auc:.3f}") + else: + ax.set_title(f"{class_names[0]} | AUC={_auc:.3f}") + + ax.set_ylabel("TPR") + ax.set_xlabel("FPR") + return fig + + elif x_values.ndim == 2: + num_classes = y_values.shape[-1] + + if class_names is not None and len(class_names) != num_classes: + raise ValueError( + "Class names must be of same length as the " + "fpr.shape[-1] and tpr.shape[-1]." 
+ ) + elif class_names is None: + class_names = [f"Class_{i}" for i in range(num_classes)] + + fig, axs = create_square_subplots(num_classes) + # Normalization for color mapping + norm = plt.Normalize(vmin=min(thresholds), vmax=max(thresholds)) + cmap = plt.get_cmap("viridis") + + for ax, i in zip(axs, range(num_classes)): + _auc = auc(x_values[:, i], y_values[:, i]) + points = np.array([x_values[:, i], y_values[:, i]]).T.reshape( + -1, 1, 2 + ) + segments = np.concatenate([points[:-1], points[1:]], axis=1) + lc = LineCollection(segments, cmap=cmap, norm=norm) + lc.set_array(thresholds) + lc.set_linewidth(2) + ax.add_collection(lc) + ax.scatter( + x_values[:, i], + y_values[:, i], + color="black", + marker="+", + alpha=0.1, + ) + ax.set_title(f"{class_names[i]} | AUC={_auc:.3f}") + + ax.set_ylabel("TPR") + ax.set_xlabel("FPR") + + # Create a ScalarMappable and add a color bar + sm = ScalarMappable(cmap=cmap, norm=norm) + sm.set_array([]) # Only needed for the color bar + + # Add color bar to the figure + fig.colorbar( + sm, + ax=axs, + orientation="horizontal", + fraction=0.02, + pad=0.1, + aspect=30, + label="Threshold", + ) + fig.subplots_adjust(hspace=0.4, wspace=0.4, bottom=0.2) + return fig + else: + raise NotImplementedError("...") diff --git a/streamauc/streaming_metrics.py b/streamauc/streaming_metrics.py new file mode 100644 index 0000000..5f413f6 --- /dev/null +++ b/streamauc/streaming_metrics.py @@ -0,0 +1,502 @@ +from typing import Callable, List, Optional, Tuple, Union + +import matplotlib.pyplot as plt +import numpy as np + +import streamauc.metrics as metrics +from streamauc.plot_util import plot_curve_and_auc +from streamauc.utils import AggregationMethod, auc + +__all__ = ["StreamingMetrics"] + + +def _validate_thresholds( + num_thresholds: int, + thresholds: Optional[Union[List[float], np.ndarray]] = None, +) -> Tuple[int, np.ndarray]: + if thresholds is not None: + if ( + np.min(thresholds) < 0 + or np.max(thresholds) > 1 + or len(thresholds) <= 1 + ): + raise ValueError( + f"Values must be in range [0., 1.]," + f" found values in range " + f"[{np.min(thresholds)}, {np.max(thresholds)}]" + ) + thresholds = np.sort(thresholds) + num_thresholds = len(thresholds) + 2 + else: + if num_thresholds <= 1: + raise ValueError( + "Argument `num_thresholds` must be an integer > 1. " + f"Received: num_thresholds={num_thresholds}" + ) + thresholds = np.linspace(0, 1, num_thresholds - 2) + + # Add endpoints slightly below 0 and above 1 to account for floating point + # imprecisions + epsilon = np.finfo(float).eps + thresholds = np.concatenate( + ([0.0 - epsilon], thresholds, [1.0 + epsilon]) + )[::-1] + + return num_thresholds, thresholds + + +class StreamingMetrics: + """ + Class for computing metrics in a minibatch-wise, iterative, fashion. + + Parameters + ---------- + num_thresholds : int, optional + Number of thresholds to evaluate the curve. Default is 200. + curve_type : str, optional + Type of curve to compute, either "ROC" or "PR". Default is "PR". + num_classes : int + Number of classes in the multiclass setting. Must be >= 2. + thresholds : list of float, optional + List of specific thresholds to evaluate the curve. 
+ + """ + + def __init__( + self, + num_thresholds: int = 200, + num_classes: int = 2, + thresholds: Optional[Union[List[float], np.ndarray]] = None, + ): + if num_classes < 2: + raise ValueError("Argument `num_classes` must be an integer >= 2.") + + self.num_classes = num_classes + + self.num_thresholds, self.thresholds = _validate_thresholds( + num_thresholds, thresholds + ) + + self._confusion_matrix = np.zeros( + (self.num_thresholds, self.num_classes, 2, 2), + dtype=int, + ) + + @property + def confusion_matrix(self) -> np.ndarray: + """ + For each threshold, and for each class, there is a 2x2 confusion + matrix. The entries of each confusion matrix correspond to the + labels of: + np.array([ ["TP","FN"], + ["FP", "TN"] + ]) + + That is, the indices are given by: + TP: self.confusion_matrix[..., 0,0] + FP: self.confusion_matrix[..., 1,0] + FN: self.confusion_matrix[..., 0,1] + TN: self.confusion_matrix[..., 1,1] + + Note, that this corresponds to the flipped confusion matrix of + sklearn. We prefer this order, as it corresponds to many written + references, e.g. the wikipedia page. + + That is, self.confusion_matrix = np.flip(sklearn_confusion_matrix) + + """ + return self._confusion_matrix + + def reset(self): + """ + Reset the intermediate values for the confusion matrix. + """ + self._confusion_matrix = np.zeros( + (self.num_thresholds, self.num_classes, 2, 2), + dtype=int, + ) + + def update( + self, + y_true: np.ndarray, + y_score: np.ndarray, + check_inputs: bool = True, + ): + """ + Update the intermediate values based on streaming data. + + Parameters + ---------- + y_true : np.ndarray + Ground truth labels of shape [-1] (or [-1, 1], [-1, 1, 1, 1]...) + with values indicating the class index. Alternatively, may also + be one-hot encoded labels of shape [-1, num_classes]. + y_score : np.ndarray + Predicted probabilities for each class of shape[-1, num_classes] + + Raises + ------ + ValueError + If the shapes of `y_true` and `y_pred` do not match. + """ + + y_true = np.squeeze(y_true) + y_score = np.squeeze(y_score) + + if check_inputs: + if y_true.ndim > 2: + raise ValueError( + f"Unknown shape of y_true: {y_true.shape}," + f"must be squeezable to either [-1, num_classes] or [-1]." + ) + if y_score.ndim > 2: + raise ValueError( + f"Unknown shape of y_true: {y_true.shape}," + f"must be squeezable to either [-1, num_classes] or [-1]." + ) + + if not (y_true.shape[0] == y_score.shape[0]): + raise ValueError( + "Number of samples in y_true and y_pred must match" + ) + + if (y_score.ndim == 2) and (y_score.shape[1] != self.num_classes): + raise ValueError(f"Invalid shape of y_pred: {y_score.shape}") + + if y_true.ndim == 2 and y_true.shape[1] == self.num_classes: + y_true_argmax = np.argmax(y_true, -1) + else: + y_true_argmax = y_true + + for threshold_idx, threshold in enumerate(self.thresholds): + for class_idx in range(self.num_classes): + pred_pos = y_score[:, class_idx] >= threshold + is_pos = y_true_argmax == class_idx + + tp = np.sum(pred_pos & is_pos) + fp = np.sum(pred_pos & (~is_pos)) + fn = np.sum((~pred_pos) & (is_pos)) + tn = np.sum((~pred_pos) & (~is_pos)) + + self._confusion_matrix[threshold_idx, class_idx, 0, 0] += tp + self._confusion_matrix[threshold_idx, class_idx, 1, 0] += fp + self._confusion_matrix[threshold_idx, class_idx, 1, 1] += tn + self._confusion_matrix[threshold_idx, class_idx, 0, 1] += fn + + def _total(self) -> np.ndarray: + """ + Calculate total for each threshold and class. + Of course, this should be the same value for all thresholds. 
+ + Returns + ------- + np.ndarray + Total at each threshold. + """ + total = self.confusion_matrix.sum(-1).sum(-1) + return total + + def true_positives(self) -> np.ndarray: + """ + Calculate true positives for each threshold and class. + + Returns + ------- + np.ndarray + True positives at each threshold. + """ + + return self.confusion_matrix[..., 0, 0] + # tp = np.diagonal(self._confusion_matrix, axis1=1, axis2=2) + # return tp + + def false_positives(self) -> np.ndarray: + """ + Calculate false positives for each threshold and class. + + Returns + ------- + np.ndarray + False positives at each threshold. + """ + # tp = self.true_positives() + # pp = self.predicted_positives() + # + # fp = pp - tp + # return fp + + return self.confusion_matrix[..., 1, 0] + + def true_negatives(self) -> np.ndarray: + """ + Calculate the total negatives for each threshold and class. + + Returns + ------- + np.ndarray + Negatives at each threshold. + """ + + return self.confusion_matrix[..., 1, 1] + + def false_negatives(self) -> np.ndarray: + """ + Calculate false negatives for each threshold and class. + + Returns + ------- + np.ndarray + False negatives at each threshold. + """ + # tp = self.true_positives() + # p = self.positives() + # + # fn = p - tp + return self.confusion_matrix[..., 0, 1] + + def positives(self) -> np.ndarray: + """ + Calculate the total positives for each threshold and class. + + Returns + ------- + np.ndarray + Positives at each threshold. + """ + # return np.sum(self._confusion_matrix, axis=-1) + return ( + self.confusion_matrix[..., 0, 0] + self.confusion_matrix[..., 0, 1] + ) + + def negatives(self) -> np.ndarray: + """ + Calculate the total negatives for each threshold and class. + + Returns + ------- + np.ndarray + Negatives at each threshold. + """ + + return self._total() - self.positives() + + def predicted_positives(self): + """ + Calculate predicted positives for each threshold and class. + + Returns + ------- + np.ndarray + Predicted positives at each threshold. + """ + + return ( + self.confusion_matrix[..., 0, 0] + self.confusion_matrix[..., 1, 0] + ) + + def predicted_negatives(self) -> np.ndarray: + """ + Calculate predicted negatives for each threshold and class. + + Returns + ------- + np.ndarray + Predicted positives at each threshold. 
+ """ + # + # pp = self.predicted_positives() + # total = self._total() + # + # pn = total - pp + # return pn + + return ( + self.confusion_matrix[..., 0, 1] + self.confusion_matrix[..., 1, 1] + ) + + def calc_metric( + self, + metric: Callable, + method: AggregationMethod = AggregationMethod.MACRO, + class_index: Optional[int] = None, + check_inputs=True, + ): + tp = self.true_positives() + fp = self.false_positives() + fn = self.false_negatives() + tn = self.true_negatives() + + return metric( + tp=tp, + fp=fp, + fn=fn, + tn=tn, + class_index=class_index, + method=method, + check_inputs=check_inputs, + ) + + def auc( + self, + metric_xaxis: Callable = metrics.fpr, + metric_yaxis: Callable = metrics.tpr, + method: AggregationMethod = AggregationMethod.ONE_VS_ALL, + class_index: Optional[int] = None, + check_inputs=True, + ): + + metric_args = dict( + tp=self.true_positives(), + fp=self.false_positives(), + fn=self.false_negatives(), + tn=self.true_negatives(), + class_index=class_index, + method=method, + check_inputs=check_inputs, + ) + + x_values = metric_xaxis(**metric_args) + y_values = metric_yaxis(**metric_args) + return auc(x_values, y_values) + + def precision_recall_curve( + self, + method: AggregationMethod = AggregationMethod.ONE_VS_ALL, + class_index: Optional[int] = None, + check_inputs: bool = True, + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Compute precision and recall at all thresholds for plotting and auc + computation. We adopt the behaviour of sklearn, in that the + precision corresponding to a recall of 0 is 1. + + (Technically its undefined since its tp/(tp+fp) with tp=fp=0, + but the value of 1 serves for stable plotting.) + + Parameters + ---------- + method : AggregationMethod + Method used to compute precision and recall for multiple classes. + Micro and macro refer to the averaging method. + Macro computes the metric for each class, and then averages the + metrics. + If '1-vs-all' the index for the positive class has to be defined + in 'class_index'. All other classes will be summarized as the + negative class. + + Must be one of ["macro","micro","1-vs-all"]. + + class_index : int, optional + Class index for "1-vs-all" calculation. + Required if `method` is "1-vs-all". + + Returns + ------- + precision : np.ndarray + Precision values at each threshold. + recall : np.ndarray + Recall values at each threshold. 
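+        thresholds : np.ndarray
+            Score thresholds corresponding to the returned precision and
+            recall values.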
+ """ + tp = self.true_positives() + fp = self.false_positives() + fn = self.false_negatives() + + precision = metrics.precision( + tp=tp, + fp=fp, + method=method, + class_index=class_index, + check_inputs=check_inputs, + ) + + recall = metrics.recall( + tp=tp, + fn=fn, + method=method, + class_index=class_index, + check_inputs=check_inputs, + ) + # ensure precision 1 at recall 0 + # precision[0] = 1.0 + return ( + precision[::-1][1:].squeeze(), + recall[::-1][1:].squeeze(), + self.thresholds[1:][::-1].squeeze(), + ) + + def roc_curve( + self, + method: AggregationMethod = AggregationMethod.ONE_VS_ALL, + class_index: Optional[int] = None, + check_inputs: bool = True, + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + tp = self.true_positives() + fp = self.false_positives() + fn = self.false_negatives() + tn = self.true_negatives() + + _tpr = metrics.tpr( + tp=tp, + fn=fn, + method=method, + class_index=class_index, + check_inputs=check_inputs, + ) + _fpr = metrics.fpr( + fp=fp, + tn=tn, + method=method, + class_index=class_index, + check_inputs=check_inputs, + ) + return _fpr.squeeze(), _tpr.squeeze(), self.thresholds.squeeze() + + def plot_roc_curve( + self, + class_names: Optional[List[str]] = None, + method=AggregationMethod.ONE_VS_ALL, + class_index: Optional[int] = None, + **kwargs, + ) -> plt.Figure: # + # pragma: nocover + fpr, tpr, thresholds = self.roc_curve( + method=method, + class_index=class_index, + ) + + if method != AggregationMethod.ONE_VS_ALL: + class_names = None + return plot_curve_and_auc( + x_values=fpr, + y_values=tpr, + thresholds=thresholds, + class_names=class_names, + **kwargs, + ) + + def plot_precision_recall_curve( + self, + class_names: Optional[List[str]] = None, + method=AggregationMethod.ONE_VS_ALL, + class_index: Optional[int] = None, + **kwargs, + ) -> plt.Figure: # + + # pragma: nocover + precision, recall, thresholds = self.precision_recall_curve( + method=method, + class_index=class_index, + ) + + if method != AggregationMethod.ONE_VS_ALL: + class_names = None + assert ( + class_index is None + ), "class_index is only usable for ONE_VS_ALL" + return plot_curve_and_auc( + x_values=recall, + y_values=precision, + thresholds=thresholds, + class_names=class_names, + **kwargs, + ) diff --git a/streamauc/utils.py b/streamauc/utils.py new file mode 100644 index 0000000..9432234 --- /dev/null +++ b/streamauc/utils.py @@ -0,0 +1,146 @@ +from enum import Enum +from typing import Iterable, Union + +import numpy as np + +__all__ = [ + "AggregationMethod", + "auc", + "copy_docstring_from", + "check_confusion_matrix_entries", +] + + +def copy_docstring_from(source): + """ + Decorator to copy the docstring from one function to another. + + Parameters + ---------- + source : function + The function from which to copy the docstring. + + Returns + ------- + function + The decorated function with the copied docstring. + """ + + def decorator(target): + target.__doc__ = source.__doc__ + return target + + return decorator + + +class AggregationMethod(Enum): + """ + Enumeration for specifying the method of aggregating metrics in + multi-class classification. + + Attributes: + ---------- + MICRO : str + Micro-averaging method, which aggregates contributions from all classes + to compute the average metric. + MACRO : str + Macro-averaging method, which computes the metric independently for + each class and then takes the average. 
+    ONE_VS_ALL : str
+        One-vs-all method, which treats each class in turn as the
+        positive class and all others as negative, computing metrics for
+        each class in this manner.
+    """
+
+    MICRO = "MICRO"
+    MACRO = "MACRO"
+    ONE_VS_ALL = "ONE_VS_ALL"
+
+
+def _all_equal(iterable: Iterable):
+    """
+    Check if all elements in the iterable are equal.
+
+    Parameters:
+    ----------
+    iterable : iterable
+        An iterable containing elements to be compared.
+
+    Returns:
+    -------
+    bool
+        True if all elements in the iterable are equal or if the
+        iterable is empty.
+        False otherwise.
+
+    """
+    iterator = iter(iterable)
+    try:
+        first = next(iterator)
+    except StopIteration:
+        return True
+    return all(element == first for element in iterator)
+
+
+def check_confusion_matrix_entries(*args):
+    """
+    Validate that input confusion matrix arrays are 2D, have the same shape,
+    and are non-negative.
+
+    Parameters:
+    ----------
+    *args : np.ndarray
+        Variable number of input arrays representing confusion matrix entries.
+
+    Raises:
+    ------
+    AssertionError
+        If input arrays are not 2D, do not have the same shape, or contain
+        negative values.
+    """
+    assert _all_equal(
+        [arg.ndim for arg in args]
+    ), "Input arrays must be two-dimensional."
+    assert _all_equal(
+        [arg.shape for arg in args]
+    ), "Input arrays must have the same shape."
+    assert all(
+        [np.min(arg) >= 0 for arg in args]
+    ), "Input arrays must be non-negative."
+
+
+def auc(vals_x: np.ndarray, vals_y: np.ndarray) -> Union[np.ndarray, float]:
+    """
+    Compute the approximate area under the curve.
+
+    This is a thin wrapper around np.trapz that ensures the integral is
+    non-negative, i.e. it is insensitive to the sort order of the input
+    arrays.
+
+    Parameters
+    ----------
+    vals_x : np.ndarray
+        Must be squeezable to shape (-1,) or (-1, n_classes).
+    vals_y : np.ndarray
+        Must be squeezable to shape (-1,) or (-1, n_classes).
+
+    Returns
+    -------
+    Union[np.ndarray, float]
+        Approximate AUC value. Either a single float or an np.ndarray of
+        shape (vals_x.shape[1],).
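+
+    Examples
+    --------
+    A minimal illustration: for the line y = x sampled at three points,
+    the trapezoidal area is 0.5 regardless of whether the inputs are
+    passed in ascending or descending order::
+
+        auc(np.array([0.0, 0.5, 1.0]), np.array([0.0, 0.5, 1.0]))  # 0.5
+        auc(np.array([1.0, 0.5, 0.0]), np.array([1.0, 0.5, 0.0]))  # 0.5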
+ """ + vals_x = np.squeeze(vals_x) + vals_y = np.squeeze(vals_y) + + if vals_x.ndim == 1: + return np.abs(np.trapz(x=vals_x, y=vals_y)) + elif vals_x.ndim == 2: + return np.array( + [ + np.trapz(x=vals_x[..., i], y=vals_y[..., i]) + for i in range(vals_x.shape[-1]) + ] + ) + else: + raise NotImplementedError("Inputs must be 1 or 2 dimensional.") diff --git a/tests/metrics/__init__.py b/tests/metrics/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/metrics/test_accuracy.py b/tests/metrics/test_accuracy.py new file mode 100644 index 0000000..fbc5a88 --- /dev/null +++ b/tests/metrics/test_accuracy.py @@ -0,0 +1,179 @@ +import unittest +import numpy as np +from unittest.mock import patch +from streamauc.metrics._accuracy import ( + accuracy, + accuracy_micro, + accuracy_macro, + accuracy_onevsall, +) +from streamauc.utils import AggregationMethod + + +class TestAccuracyFunction(unittest.TestCase): + @patch("streamauc.metrics._accuracy.accuracy_micro") + def test_accuracy_micro(self, mock_acc_micro): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + tn = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + + mock_acc_micro.return_value = np.array([0.83333333, 0.92307692]) + + result = accuracy( + tp, tn, fp, fn, method=AggregationMethod.MICRO, check_inputs=True + ) + + mock_acc_micro.assert_called_once_with( + tp=tp, tn=tn, fn=fn, fp=fp, check_inputs=True + ) + + np.testing.assert_almost_equal(result, mock_acc_micro.return_value) + + @patch("streamauc.metrics._accuracy.accuracy_macro") + def test_accuracy_macro(self, mock_acc_macro): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + tn = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + + mock_acc_macro.return_value = np.array([0.83333333, 0.88888889]) + + result = accuracy( + tp, tn, fp, fn, method=AggregationMethod.MACRO, check_inputs=True + ) + + mock_acc_macro.assert_called_once_with( + tp=tp, tn=tn, fn=fn, fp=fp, check_inputs=True + ) + + np.testing.assert_almost_equal(result, mock_acc_macro.return_value) + + @patch("streamauc.metrics._accuracy.accuracy_onevsall") + def test_accuracy_onevsall(self, mock_acc_onevsall): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + tn = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + class_index = 1 + + mock_acc_onevsall.return_value = np.array( + [[0.83333333, 0.92307692], [0.8, 0.9]] + ) + + result = accuracy( + tp, + tn, + fp, + fn, + method=AggregationMethod.ONE_VS_ALL, + class_index=class_index, + check_inputs=True, + ) + + mock_acc_onevsall.assert_called_once_with( + tp=tp, tn=tn, fn=fn, fp=fp, check_inputs=True + ) + + expected_acc = mock_acc_onevsall.return_value[..., class_index] + np.testing.assert_almost_equal(result, expected_acc) + + def test_invalid_method(self): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + tn = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + + with self.assertRaises(ValueError): + accuracy( + tp, tn, fp, fn, method="invalid_method", check_inputs=True + ) + + +class TestAccuracyOneVsAll(unittest.TestCase): + def test_accuracy_onevsall(self): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + tn = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + + expected_acc = (tp + tn) / (tp + tn + fp + fn) + + result = accuracy_onevsall( + tp=tp, tn=tn, fn=fn, 
fp=fp, check_inputs=True + ) + np.testing.assert_almost_equal(result, expected_acc) + + def test_zero_division_onevsall(self): + # Test case with zeros to ensure no division by zero errors + tp = np.array([[0, 0, 0], [0, 0, 0]]) + fp = np.array([[0, 0, 0], [0, 0, 0]]) + fn = np.array([[0, 0, 0], [0, 0, 0]]) + tn = np.array([[0, 0, 0], [0, 0, 0]]) + + expected_acc = np.array([0.0, 0.0]) + result = accuracy_onevsall( + tp=tp, tn=tn, fn=fn, fp=fp, check_inputs=True + ) + self.assertEqual(result.shape, tp.shape) + + np.testing.assert_almost_equal(result[:, 0], expected_acc) + np.testing.assert_almost_equal(result[:, 1], expected_acc) + + +class TestAccuracyMacro(unittest.TestCase): + def test_accuracy_macro(self): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + tn = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + + expected_acc_onevsall = (tp + tn) / (tp + tn + fp + fn) + expected_acc_macro = np.mean(expected_acc_onevsall, axis=-1) + + result = accuracy_macro(tp=tp, tn=tn, fn=fn, fp=fp, check_inputs=True) + + np.testing.assert_almost_equal(result, expected_acc_macro) + + def test_zero_division_macro(self): + # Test case with zeros to ensure no division by zero errors + tp = np.array([[0, 0, 0], [0, 0, 0]]) + fp = np.array([[0, 0, 0], [0, 0, 0]]) + fn = np.array([[0, 0, 0], [0, 0, 0]]) + tn = np.array([[0, 0, 0], [0, 0, 0]]) + + expected_acc = np.array([0.0, 0.0]) + result = accuracy_micro(tp=tp, tn=tn, fn=fn, fp=fp, check_inputs=True) + np.testing.assert_almost_equal(result, expected_acc) + + +class TestAccuracyMicro(unittest.TestCase): + def test_accuracy_micro(self): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + tn = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + + tp_sum = np.sum(tp, axis=-1) + tn_sum = np.sum(tn, axis=-1) + fn_sum = np.sum(fn, axis=-1) + fp_sum = np.sum(fp, axis=-1) + + expected_acc_micro = (tp_sum + tn_sum) / ( + tp_sum + tn_sum + fp_sum + fn_sum + ) + + result = accuracy_micro(tp=tp, tn=tn, fn=fn, fp=fp, check_inputs=True) + + np.testing.assert_almost_equal(result, expected_acc_micro) + + def test_zero_division_micro(self): + # Test case with zeros to ensure no division by zero errors + tp = np.array([[0, 0, 0], [0, 0, 0]]) + fp = np.array([[0, 0, 0], [0, 0, 0]]) + fn = np.array([[0, 0, 0], [0, 0, 0]]) + tn = np.array([[0, 0, 0], [0, 0, 0]]) + + expected_acc = np.array([0.0, 0.0]) + result = accuracy_micro(tp=tp, tn=tn, fn=fn, fp=fp, check_inputs=True) + np.testing.assert_almost_equal(result, expected_acc) diff --git a/tests/metrics/test_f1_score.py b/tests/metrics/test_f1_score.py new file mode 100644 index 0000000..def4b66 --- /dev/null +++ b/tests/metrics/test_f1_score.py @@ -0,0 +1,171 @@ +import unittest +from unittest.mock import patch +import numpy as np + +from streamauc.metrics._f1_score import ( + f1_score, + f1_onevsall, + f1_macro, + f1_micro, +) +from streamauc.metrics._precision import precision_micro, precision_onevsall +from streamauc.metrics._tpr import tpr_micro, tpr_onevsall + +from streamauc.utils import AggregationMethod + + +class TestF1ScoreFunction(unittest.TestCase): + @patch("streamauc.metrics._f1_score.f1_micro") + def test_f1_micro(self, mock_f1_micro): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + + mock_f1_micro.return_value = np.array([0.83333333, 0.92307692]) + + result = f1_score( + tp, fp, fn, method=AggregationMethod.MICRO, 
check_inputs=True + ) + + mock_f1_micro.assert_called_once_with( + tp=tp, fn=fn, fp=fp, check_inputs=True + ) + + np.testing.assert_almost_equal(result, mock_f1_micro.return_value) + + @patch("streamauc.metrics._f1_score.f1_macro") + def test_f1_macro(self, mock_f1_macro): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + + mock_f1_macro.return_value = np.array([0.83333333, 0.88888889]) + + result = f1_score( + tp, fp, fn, method=AggregationMethod.MACRO, check_inputs=True + ) + + mock_f1_macro.assert_called_once_with( + tp=tp, fn=fn, fp=fp, check_inputs=True + ) + + np.testing.assert_almost_equal(result, mock_f1_macro.return_value) + + @patch("streamauc.metrics._f1_score.f1_onevsall") + def test_f1_onevsall(self, mock_f1_onevsall): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + class_index = 1 + + mock_f1_onevsall.return_value = np.array( + [[0.83333333, 0.92307692], [0.8, 0.9]] + ) + + result = f1_score( + tp, + fp, + fn, + method=AggregationMethod.ONE_VS_ALL, + class_index=class_index, + check_inputs=True, + ) + + mock_f1_onevsall.assert_called_once_with( + tp=tp, fn=fn, fp=fp, check_inputs=True + ) + + expected_f1 = mock_f1_onevsall.return_value[..., class_index] + np.testing.assert_almost_equal(result, expected_f1) + + def test_invalid_method(self): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + + with self.assertRaises(ValueError): + f1_score(tp, fp, fn, method="invalid_method", check_inputs=True) + + +class TestF1OneVsAll(unittest.TestCase): + def test_f1_onevsall(self): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + + _precision = precision_onevsall(tp=tp, fp=fp, check_inputs=True) + _tpr = tpr_onevsall(tp=tp, fn=fn, check_inputs=True) + _expected_f1 = (2 * _precision * _tpr) / (_precision + _tpr + 1e-12) + + result = f1_onevsall(tp=tp, fp=fp, fn=fn, check_inputs=True) + np.testing.assert_almost_equal(result, _expected_f1) + + def test_zero_division_onevsall(self): + # Test case with zeros to ensure no division by zero errors + tp = np.array([[0, 0, 0], [0, 0, 0]]) + fp = np.array([[0, 0, 0], [0, 0, 0]]) + fn = np.array([[0, 0, 0], [0, 0, 0]]) + + _expected_f1 = np.array([0.0, 0.0]) + result = f1_onevsall(tp=tp, fp=fp, fn=fn, check_inputs=True) + + self.assertEqual(result.shape, tp.shape) + + np.testing.assert_almost_equal(result[:, 0], _expected_f1) + np.testing.assert_almost_equal(result[:, 1], _expected_f1) + np.testing.assert_almost_equal(result[:, 2], _expected_f1) + + +class TestF1Macro(unittest.TestCase): + def test_f1_macro(self): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + + _precision = precision_onevsall(tp=tp, fp=fp, check_inputs=True) + _tpr = tpr_onevsall(tp=tp, fn=fn, check_inputs=True) + _expected_f1_macro = ( + (2 * _precision * _tpr) / (_precision + _tpr + 1e-12) + ).mean(-1) + + result = f1_macro(tp=tp, fp=fp, fn=fn, check_inputs=True) + + np.testing.assert_almost_equal(result, _expected_f1_macro) + + def test_zero_division_macro(self): + # Test case with zeros to ensure no division by zero errors + tp = np.array([[0, 0, 0], [0, 0, 0]]) + fp = np.array([[0, 0, 0], [0, 0, 0]]) + fn = np.array([[0, 0, 0], [0, 0, 0]]) + + _expected_f1 = np.array([0.0, 0.0]) + result = f1_macro(tp=tp, fp=fp, 
fn=fn, check_inputs=True) + np.testing.assert_almost_equal(result, _expected_f1) + + +# +class TestF1Micro(unittest.TestCase): + def test_f1_micro(self): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + + _precision = precision_micro(tp=tp, fp=fp, check_inputs=True) + _tpr = tpr_micro(tp=tp, fn=fn, check_inputs=True) + _expected_f1_micro = (2 * _precision * _tpr) / ( + _precision + _tpr + 1e-12 + ) + + result = f1_micro(tp=tp, fp=fp, fn=fn, check_inputs=True) + + np.testing.assert_almost_equal(result, _expected_f1_micro) + + def test_zero_division_micro(self): + # Test case with zeros to ensure no division by zero errors + tp = np.array([[0, 0, 0], [0, 0, 0]]) + fp = np.array([[0, 0, 0], [0, 0, 0]]) + fn = np.array([[0, 0, 0], [0, 0, 0]]) + + _expected_f1 = np.array([0.0, 0.0]) + result = f1_micro(tp=tp, fp=fp, fn=fn, check_inputs=True) + np.testing.assert_almost_equal(result, _expected_f1) diff --git a/tests/metrics/test_fnr.py b/tests/metrics/test_fnr.py new file mode 100644 index 0000000..ab7d1c1 --- /dev/null +++ b/tests/metrics/test_fnr.py @@ -0,0 +1,70 @@ +import unittest +from unittest.mock import patch +import numpy as np + +from streamauc.metrics._fnr import fnr + +from streamauc.utils import AggregationMethod + + +class TestFNRFunction(unittest.TestCase): + @patch("streamauc.metrics._tpr.tpr_micro") + def test_fnr_micro(self, mock_tpr): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + + mock_tpr.return_value = np.array([0.83333333, 0.92307692]) + + result = fnr(tp, fn, method=AggregationMethod.MICRO, check_inputs=True) + + mock_tpr.assert_called_once_with(tp=tp, fn=fn, check_inputs=True) + + expected_fnr = 1 - mock_tpr.return_value + np.testing.assert_almost_equal(result, expected_fnr) + + @patch("streamauc.metrics._tpr.tpr_macro") + def test_fnr_macro(self, mock_tpr): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + + mock_tpr.return_value = np.array([0.83333333, 0.88888889]) + + result = fnr(tp, fn, method=AggregationMethod.MACRO, check_inputs=True) + + mock_tpr.assert_called_once_with(tp=tp, fn=fn, check_inputs=True) + + expected_fnr = 1 - mock_tpr.return_value + np.testing.assert_almost_equal(result, expected_fnr) + + @patch("streamauc.metrics._tpr.tpr_onevsall") + def test_fnr_onevsall(self, mock_tpr): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + class_index = 1 + + mock_tpr.return_value = np.array( + [[0.83333333, 0.92307692], [0.8, 0.9]] + ) + + result = fnr( + tp, + fn, + method=AggregationMethod.ONE_VS_ALL, + class_index=class_index, + check_inputs=True, + ) + mock_tpr.assert_called_once_with(tp=tp, fn=fn, check_inputs=True) + expected_fnr = 1 - mock_tpr.return_value[..., class_index] + + np.testing.assert_almost_equal(result, expected_fnr) + + def test_invalid_method(self): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + + with self.assertRaises(ValueError): + fnr(tp, fn, method="invalid_method", check_inputs=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/metrics/test_fpr.py b/tests/metrics/test_fpr.py new file mode 100644 index 0000000..1b8c3bc --- /dev/null +++ b/tests/metrics/test_fpr.py @@ -0,0 +1,216 @@ +import unittest +from unittest.mock import patch +import numpy as np + +from streamauc.utils import * +from streamauc.metrics._fpr import fpr_micro, fpr_macro, fpr_onevsall, fpr + + +# Test case with a simple example +tp = 
np.array([[10, 5, 0], [7, 8, 9]]) +fp = np.array([[2, 1, 0], [1, 0, 1]]) +fn = np.array([[1, 0, 2], [0, 2, 1]]) +tn = np.array([[5, 10, 15], [20, 15, 10]]) + + +class TestFPRFunction(unittest.TestCase): + @patch("streamauc.metrics._fpr.fpr_micro") + @patch("streamauc.metrics._fpr.fpr_macro") + @patch("streamauc.metrics._fpr.fpr_onevsall") + def test_fpr_micro(self, mock_onevsall, mock_macro, mock_micro): + fp = np.array([[2, 1, 0], [1, 0, 1]]) + tn = np.array([[10, 5, 0], [7, 8, 9]]) + + mock_micro.return_value = np.array([0.16666667, 0.07692308]) + + result = fpr(fp, tn, method=AggregationMethod.MICRO, check_inputs=True) + + mock_micro.assert_called_once_with(fp=fp, tn=tn, check_inputs=True) + mock_macro.assert_not_called() + mock_onevsall.assert_not_called() + + np.testing.assert_almost_equal(result, mock_micro.return_value) + + @patch("streamauc.metrics._fpr.fpr_micro") + @patch("streamauc.metrics._fpr.fpr_macro") + @patch("streamauc.metrics._fpr.fpr_onevsall") + def test_fpr_macro(self, mock_onevsall, mock_macro, mock_micro): + fp = np.array([[2, 1, 0], [1, 0, 1]]) + tn = np.array([[10, 5, 0], [7, 8, 9]]) + + mock_macro.return_value = np.array([0.16666667, 0.11111111]) + + result = fpr(fp, tn, method=AggregationMethod.MACRO, check_inputs=True) + + mock_macro.assert_called_once_with(fp=fp, tn=tn, check_inputs=True) + mock_micro.assert_not_called() + mock_onevsall.assert_not_called() + + np.testing.assert_almost_equal(result, mock_macro.return_value) + + @patch("streamauc.metrics._fpr.fpr_micro") + @patch("streamauc.metrics._fpr.fpr_macro") + @patch("streamauc.metrics._fpr.fpr_onevsall") + def test_fpr_onevsall(self, mock_onevsall, mock_macro, mock_micro): + fp = np.array([[2, 1, 0], [1, 0, 1]]) + tn = np.array([[10, 5, 0], [7, 8, 9]]) + class_index = 1 + + mock_onevsall.return_value = np.array( + [[0.16666667, 0.07692308], [0.125, 0.1]] + ) + + result = fpr( + fp, + tn, + method=AggregationMethod.ONE_VS_ALL, + class_index=class_index, + check_inputs=True, + ) + + mock_onevsall.assert_called_once_with(fp=fp, tn=tn, check_inputs=True) + mock_micro.assert_not_called() + mock_macro.assert_not_called() + + np.testing.assert_almost_equal( + result, mock_onevsall.return_value[..., class_index] + ) + + def test_invalid_method(self): + fp = np.array([[2, 1, 0], [1, 0, 1]]) + tn = np.array([[10, 5, 0], [7, 8, 9]]) + + with self.assertRaises(ValueError): + fpr(fp, tn, method="invalid_method", check_inputs=True) + + +class TestComputeFprMicro(unittest.TestCase): + def test_compute_roc_micro(self): + expected_fpr = np.array([0.09090909, 0.04255319]) + + fpr = fpr_micro(fp=fp, tn=tn) + + np.testing.assert_almost_equal(fpr, expected_fpr) + + def test_zero_division(self): + # Test case with zeros to ensure no division by zero errors + fp = np.array([[0, 0, 0], [0, 0, 0]]) + tn = np.array([[0, 0, 0], [0, 0, 0]]) + + expected_fpr = np.array([0.0, 0.0]) + + fpr = fpr_micro(fp=fp, tn=tn) + + np.testing.assert_almost_equal(fpr, expected_fpr) + + def test_invalid_inputs(self): + # Test case with zeros to ensure no division by zero errors + fp = np.array([[0, 0, 0], [0, -1, 0]]) + tn = np.array([[0, 0, 0], [0, 0, 0]]) + + with self.assertRaises(AssertionError): + fpr = fpr_micro(fp=fp, tn=tn) + + fp = np.array([[0, 0, 0, 0], [0, 1, 0, 0]]) + with self.assertRaises(AssertionError): + fpr = fpr_micro(fp=fp, tn=tn) + + fp = np.array([[[0, 0, 0], [0, 1, 2]]]) + with self.assertRaises(AssertionError): + fpr = fpr_micro(fp=fp, tn=tn) + + fpr = fpr_micro(fp=fp, tn=tn, check_inputs=False) + + +class 
TestComputeFprMacro(unittest.TestCase): + + def test_compute_roc_macro(self): + # Test case with a simple example + fp = np.array([[2, 1], [1, 0]]) + tn = np.array([[5, 10], [20, 15]]) + + tpr_classwise = np.array([[0.90909091, 1.0], [1.0, 0.8]]) + + fpr_classwise = np.array([[0.28571429, 0.09090909], [0.04761905, 0.0]]) + + expected_tpr = np.mean(tpr_classwise, -1) + expected_fpr = np.mean(fpr_classwise, -1) + + fpr = fpr_macro(fp=fp, tn=tn) + + np.testing.assert_almost_equal(fpr, expected_fpr) + + def test_zero_division(self): + # Test case with zeros to ensure no division by zero errors + fp = np.array([[0, 0], [0, 0]]) + tn = np.array([[0, 0], [0, 0]]) + + expected_fpr = np.array([0.0, 0.0]) + + fpr = fpr_macro(fp=fp, tn=tn) + + np.testing.assert_almost_equal(fpr, expected_fpr) + + def test_invalid_inputs(self): + # Test case with zeros to ensure no division by zero errors + fp = np.array([[0, 0, 0], [0, -1, 0]]) + tn = np.array([[0, 0, 0], [0, 0, 0]]) + + with self.assertRaises(AssertionError): + fpr = fpr_macro(fp=fp, tn=tn) + + tp = np.array([[0, 0, 0, 0], [0, 1, 0, 0]]) + with self.assertRaises(AssertionError): + fpr = fpr_macro(fp=fp, tn=tn) + + tp = np.array([[[0, 0, 0], [0, 1, 2]]]) + with self.assertRaises(AssertionError): + fpr = fpr_macro(fp=fp, tn=tn) + + fpr = fpr_macro(fp=fp, tn=tn, check_inputs=False) + + +class TestComputeFpr1vsAll(unittest.TestCase): + + def test_compute_roc_1vsall(self): + # Test case with a simple example + fp = np.array([[2, 1], [1, 0]]) + tn = np.array([[5, 10], [20, 15]]) + + expected_fpr = np.array([[0.28571429, 0.09090909], [0.04761905, 0.0]]) + + for i in range(tn.shape[1]): + fpr = fpr_onevsall(fp=fp, tn=tn)[:, i] + np.testing.assert_almost_equal(fpr, expected_fpr[:, i]) + + def test_zero_division_1vsall(self): + # Test case with zeros to ensure no division by zero errors + fp = np.array([[0, 0], [0, 0]]) + tn = np.array([[0, 0], [0, 0]]) + class_index = 0 + + expected_fpr = np.array([0.0, 0.0]) + + fpr = fpr_onevsall(fp=fp, tn=tn)[:, class_index] + + np.testing.assert_almost_equal(fpr, expected_fpr) + + def test_invalid_inputs_1vsall(self): + # Test case with zeros to ensure no division by zero errors + fp = np.array([[0, 0, 0], [0, -1, 0]]) + tn = np.array([[0, 0, 0], [0, 0, 0]]) + + with self.assertRaises(AssertionError): + fpr = fpr_onevsall(fp=fp, tn=tn) + + fp = np.array([[0, 0, 0, 0], [0, 1, 0, 0]]) + with self.assertRaises(AssertionError): + fpr = fpr_onevsall(fp=fp, tn=tn) + + fp = np.array([[[0, 0, 0], [0, 1, 2]]]) + with self.assertRaises(AssertionError): + fpr = fpr_onevsall(fp=fp, tn=tn) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/metrics/test_jaccard.py b/tests/metrics/test_jaccard.py new file mode 100644 index 0000000..7313324 --- /dev/null +++ b/tests/metrics/test_jaccard.py @@ -0,0 +1,159 @@ +import unittest +import numpy as np +from unittest.mock import patch +from streamauc.metrics._jaccard_index import ( + jaccard_index, + jaccard_index_micro, + jaccard_index_macro, + jaccard_index_onevsall, +) +from streamauc.utils import AggregationMethod + + +class TestJaccardIndexFunction(unittest.TestCase): + @patch("streamauc.metrics._jaccard_index.jaccard_index_micro") + def test_jaccard_index_micro(self, mock_jaccard_micro): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + + mock_jaccard_micro.return_value = np.array([0.76923077, 0.81818182]) + + result = jaccard_index( + tp, fp, fn, method=AggregationMethod.MICRO, check_inputs=True + ) 
+ + mock_jaccard_micro.assert_called_once_with( + tp=tp, fn=fn, fp=fp, check_inputs=True + ) + + np.testing.assert_almost_equal(result, mock_jaccard_micro.return_value) + + @patch("streamauc.metrics._jaccard_index.jaccard_index_macro") + def test_jaccard_index_macro(self, mock_jaccard_macro): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + + mock_jaccard_macro.return_value = np.array([0.66666667, 0.85714286]) + + result = jaccard_index( + tp, fp, fn, method=AggregationMethod.MACRO, check_inputs=True + ) + + mock_jaccard_macro.assert_called_once_with( + tp=tp, fn=fn, fp=fp, check_inputs=True + ) + + np.testing.assert_almost_equal(result, mock_jaccard_macro.return_value) + + @patch("streamauc.metrics._jaccard_index.jaccard_index_onevsall") + def test_jaccard_index_onevsall(self, mock_jaccard_onevsall): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + class_index = 1 + + mock_jaccard_onevsall.return_value = np.array( + [[0.76923077, 0.83333333, 0], [0.875, 1.0, 0.9]] + ) + + result = jaccard_index( + tp, + fp, + fn, + method=AggregationMethod.ONE_VS_ALL, + class_index=class_index, + check_inputs=True, + ) + + mock_jaccard_onevsall.assert_called_once_with( + tp=tp, fn=fn, fp=fp, check_inputs=True + ) + + expected_jaccard = mock_jaccard_onevsall.return_value[..., class_index] + np.testing.assert_almost_equal(result, expected_jaccard) + + def test_invalid_method(self): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + + with self.assertRaises(ValueError): + jaccard_index( + tp, fp, fn, method="invalid_method", check_inputs=True + ) + + +class TestJaccardIndexOneVsAll(unittest.TestCase): + def test_jaccard_index_onevsall(self): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + + expected_jaccard = tp / (tp + fn + fp) + result = jaccard_index_onevsall(tp=tp, fn=fn, fp=fp, check_inputs=True) + + np.testing.assert_almost_equal(result, expected_jaccard) + + def test_zero_division_onevsall(self): + # Test case with zeros to ensure no division by zero errors + tp = np.array([[0, 0, 0], [0, 0, 0]]) + fp = np.array([[0, 0, 0], [0, 0, 0]]) + fn = np.array([[0, 0, 0], [0, 0, 0]]) + + expected_jaccard = np.array([0.0, 0.0]) + result = jaccard_index_onevsall(tp=tp, fp=fp, fn=fn, check_inputs=True) + self.assertEqual(result.shape, tp.shape) + np.testing.assert_almost_equal(result[:, 0], expected_jaccard) + np.testing.assert_almost_equal(result[:, 1], expected_jaccard) + + +class TestJaccardIndexMacro(unittest.TestCase): + def test_jaccard_index_macro(self): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + + expected_jaccard_onevsall = tp / (tp + fn + fp) + expected_jaccard_macro = np.mean(expected_jaccard_onevsall, axis=-1) + + result = jaccard_index_macro(tp=tp, fn=fn, fp=fp, check_inputs=True) + + np.testing.assert_almost_equal(result, expected_jaccard_macro) + + def test_zero_division_macro(self): + # Test case with zeros to ensure no division by zero errors + tp = np.array([[0, 0, 0], [0, 0, 0]]) + fp = np.array([[0, 0, 0], [0, 0, 0]]) + fn = np.array([[0, 0, 0], [0, 0, 0]]) + + _expected_jaccard = np.array([0.0, 0.0]) + result = jaccard_index_macro(tp=tp, fp=fp, fn=fn, check_inputs=True) + np.testing.assert_almost_equal(result, 
_expected_jaccard) + + +class TestJaccardIndexMicro(unittest.TestCase): + def test_jaccard_index_micro(self): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + + tp_sum = np.sum(tp, axis=-1) + fn_sum = np.sum(fn, axis=-1) + fp_sum = np.sum(fp, axis=-1) + + expected_jaccard = tp_sum / (tp_sum + fn_sum + fp_sum) + result = jaccard_index_micro(tp=tp, fn=fn, fp=fp, check_inputs=True) + + np.testing.assert_almost_equal(result, expected_jaccard) + + def test_zero_division_micro(self): + # Test case with zeros to ensure no division by zero errors + tp = np.array([[0, 0, 0], [0, 0, 0]]) + fp = np.array([[0, 0, 0], [0, 0, 0]]) + fn = np.array([[0, 0, 0], [0, 0, 0]]) + + _expected_jaccard = np.array([0.0, 0.0]) + result = jaccard_index_micro(tp=tp, fp=fp, fn=fn, check_inputs=True) + np.testing.assert_almost_equal(result, _expected_jaccard) diff --git a/tests/metrics/test_precision.py b/tests/metrics/test_precision.py new file mode 100644 index 0000000..2de74bd --- /dev/null +++ b/tests/metrics/test_precision.py @@ -0,0 +1,226 @@ +import unittest +from unittest.mock import patch +import numpy as np + +from streamauc.utils import * +from streamauc.metrics._precision import ( + precision_micro, + precision_macro, + precision_onevsall, + precision, +) + + +class TestPrecisionFunction(unittest.TestCase): + @patch("streamauc.metrics._precision.precision_micro") + @patch("streamauc.metrics._precision.precision_macro") + @patch("streamauc.metrics._precision.precision_onevsall") + def test_precision_micro(self, mock_onevsall, mock_macro, mock_micro): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + + mock_micro.return_value = np.array([0.83333333, 0.92307692]) + + result = precision( + tp, fp, method=AggregationMethod.MICRO, check_inputs=True + ) + + mock_micro.assert_called_once_with(tp=tp, fp=fp, check_inputs=True) + mock_macro.assert_not_called() + mock_onevsall.assert_not_called() + + np.testing.assert_almost_equal(result, mock_micro.return_value) + + @patch("streamauc.metrics._precision.precision_micro") + @patch("streamauc.metrics._precision.precision_macro") + @patch("streamauc.metrics._precision.precision_onevsall") + def test_precision_macro(self, mock_onevsall, mock_macro, mock_micro): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + + mock_macro.return_value = np.array([0.83333333, 0.88888889]) + + result = precision( + tp, fp, method=AggregationMethod.MACRO, check_inputs=True + ) + + mock_macro.assert_called_once_with(tp=tp, fp=fp, check_inputs=True) + mock_micro.assert_not_called() + mock_onevsall.assert_not_called() + + np.testing.assert_almost_equal(result, mock_macro.return_value) + + @patch("streamauc.metrics._precision.precision_micro") + @patch("streamauc.metrics._precision.precision_macro") + @patch("streamauc.metrics._precision.precision_onevsall") + def test_precision_onevsall(self, mock_onevsall, mock_macro, mock_micro): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + class_index = 1 + + mock_onevsall.return_value = np.array( + [[0.83333333, 0.92307692], [0.8, 0.9]] + ) + + result = precision( + tp, + fp, + method=AggregationMethod.ONE_VS_ALL, + class_index=class_index, + check_inputs=True, + ) + + mock_onevsall.assert_called_once_with(tp=tp, fp=fp, check_inputs=True) + mock_micro.assert_not_called() + mock_macro.assert_not_called() + + np.testing.assert_almost_equal( + result, mock_onevsall.return_value[..., 
class_index] + ) + + def test_invalid_method(self): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + + with self.assertRaises(ValueError): + precision(tp, fp, method="invalid_method", check_inputs=True) + + +class TestComputePrecisionMicro(unittest.TestCase): + def test_compute_precision_unit(self): + # Test case with a simple example + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + + expected_precision = np.array([0.83333333, 0.92307692]) + + precision = precision_micro(tp=tp, fp=fp, check_inputs=True) + + np.testing.assert_almost_equal(precision, expected_precision) + + def test_zero_division(self): + # Test case with zeros to ensure no division by zero errors + tp = np.array([[0, 0, 0], [0, 0, 0]]) + fp = np.array([[0, 0, 0], [0, 0, 0]]) + + expected_precision = np.array([0.0, 0.0]) + + precision = precision_micro(tp=tp, fp=fp, check_inputs=True) + + np.testing.assert_almost_equal(precision, expected_precision) + + def test_invalid_inputs(self): + # Test case with invalid inputs to ensure proper error handling + tp = np.array([[0, 0, 0], [0, -1, 0]]) + fp = np.array([[0, 0, 0], [0, 0, 0]]) + + with self.assertRaises(AssertionError): + precision = precision_micro(tp=tp, fp=fp, check_inputs=True) + + tp = np.array([[0, 0, 0, 0], [0, 1, 0, 0]]) + with self.assertRaises(AssertionError): + precision = precision_micro(tp=tp, fp=fp, check_inputs=True) + + tp = np.array([[[0, 0, 0], [0, 1, 2]]]) + with self.assertRaises(AssertionError): + precision = precision_micro(tp=tp, fp=fp, check_inputs=True) + + # This should not throw an error anymore + precision = precision_micro(tp=tp, fp=fp, check_inputs=False) + + +class TestComputePrecisionMacro(unittest.TestCase): + def test_compute_precision_unit(self): + # Test case with a simple example + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + + expected_precision_indiv = np.array( + [[0.83333333, 0.83333333, 0], [0.875, 1.0, 0.9]] + ) + expected_precision = expected_precision_indiv.mean(-1, keepdims=False) + + precision = precision_macro(tp=tp, fp=fp, check_inputs=True) + + np.testing.assert_almost_equal(precision, expected_precision) + + def test_zero_division(self): + # Test case with zeros to ensure no division by zero errors + tp = np.array([[0, 0, 0], [0, 0, 0]]) + fp = np.array([[0, 0, 0], [0, 0, 0]]) + + expected_precision = np.array([0.0, 0.0]) + + precision = precision_macro(tp=tp, fp=fp, check_inputs=True) + np.testing.assert_almost_equal(precision, expected_precision) + + def test_invalid_inputs(self): + # Test case with invalid inputs to ensure proper error handling + tp = np.array([[0, 0, 0], [0, -1, 0]]) + fp = np.array([[0, 0, 0], [0, 0, 0]]) + + with self.assertRaises(AssertionError): + precision = precision_macro(tp=tp, fp=fp, check_inputs=True) + + tp = np.array([[0, 0, 0, 0], [0, 1, 0, 0]]) + with self.assertRaises(AssertionError): + precision = precision_macro(tp=tp, fp=fp, check_inputs=True) + + tp = np.array([[[0, 0, 0], [0, 1, 2]]]) + with self.assertRaises(AssertionError): + precision = precision_macro(tp=tp, fp=fp, check_inputs=True) + + # This should not throw an error anymore + precision = precision_macro(tp=tp, fp=fp, check_inputs=False) + + +class TestComputePrecisionOneVsAll(unittest.TestCase): + def test_compute_precision_unit(self): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fp = np.array([[2, 1, 0], [1, 0, 1]]) + + expected_precision = np.array( + [[0.83333333, 0.83333333, 0], [0.875, 1.0, 0.9]] + ) + + for i in 
range(tp.shape[-1]): + precision = precision_onevsall(tp=tp, fp=fp, check_inputs=True)[ + :, i + ] + np.testing.assert_almost_equal(precision, expected_precision[:, i]) + + def test_zero_division(self): + # Test case with zeros to ensure no division by zero errors + tp = np.array([[0, 0, 0], [0, 0, 0]]) + fp = np.array([[0, 0, 0], [0, 0, 0]]) + + expected_precision = np.array([0.0, 0.0]) + + for i in range(tp.shape[-1]): + precision = precision_onevsall(tp=tp, fp=fp, check_inputs=True)[ + :, i + ] + np.testing.assert_almost_equal(precision, expected_precision) + + def test_invalid_inputs(self): + # Test case with invalid inputs to ensure proper error handling + tp = np.array([[0, 0, 0], [0, -1, 0]]) + fp = np.array([[0, 0, 0], [0, 0, 0]]) + + with self.assertRaises(AssertionError): + precision = precision_onevsall(tp=tp, fp=fp, check_inputs=True) + + tp = np.array([[0, 0, 0, 0], [0, 1, 0, 0]]) + with self.assertRaises(AssertionError): + precision = precision_onevsall(tp=tp, fp=fp, check_inputs=True) + + tp = np.array([[[0, 0, 0], [0, 1, 2]]]) + with self.assertRaises(AssertionError): + precision = precision_onevsall(tp=tp, fp=fp, check_inputs=True) + + # This should not throw an error anymore + precision = precision_onevsall(tp=tp, fp=fp, check_inputs=False) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/metrics/test_tnr.py b/tests/metrics/test_tnr.py new file mode 100644 index 0000000..788d888 --- /dev/null +++ b/tests/metrics/test_tnr.py @@ -0,0 +1,71 @@ +# test_tnr.py +import unittest +from unittest.mock import patch +import numpy as np + +from streamauc.metrics._tnr import tnr +from streamauc.utils import AggregationMethod + + +class TestTNRFunction(unittest.TestCase): + @patch("streamauc.metrics._fpr.fpr_micro") + def test_tnr_micro(self, mock_fpr): + fp = np.array([[2, 1, 0], [1, 0, 1]]) + tn = np.array([[10, 5, 0], [7, 8, 9]]) + + mock_fpr.return_value = np.array([0.16666667, 0.07692308]) + + result = tnr(fp, tn, method=AggregationMethod.MICRO, check_inputs=True) + + mock_fpr.assert_called_once_with(fp=fp, tn=tn, check_inputs=True) + + expected_tnr = 1 - mock_fpr.return_value + np.testing.assert_almost_equal(result, expected_tnr) + + @patch("streamauc.metrics._fpr.fpr_macro") + def test_tnr_macro(self, mock_fpr): + fp = np.array([[2, 1, 0], [1, 0, 1]]) + tn = np.array([[10, 5, 0], [7, 8, 9]]) + + mock_fpr.return_value = np.array([0.16666667, 0.11111111]) + + result = tnr(fp, tn, method=AggregationMethod.MACRO, check_inputs=True) + + mock_fpr.assert_called_once_with(fp=fp, tn=tn, check_inputs=True) + + expected_tnr = 1 - mock_fpr.return_value + np.testing.assert_almost_equal(result, expected_tnr) + + @patch("streamauc.metrics._fpr.fpr_onevsall") + def test_tnr_onevsall(self, mock_fpr): + fp = np.array([[2, 1, 0], [1, 0, 1]]) + tn = np.array([[10, 5, 0], [7, 8, 9]]) + class_index = 1 + + mock_fpr.return_value = np.array( + [[0.16666667, 0.07692308], [0.125, 0.1]] + ) + + result = tnr( + fp, + tn, + method=AggregationMethod.ONE_VS_ALL, + class_index=class_index, + check_inputs=True, + ) + + mock_fpr.assert_called_once_with(fp=fp, tn=tn, check_inputs=True) + + expected_tnr = 1 - mock_fpr.return_value[..., class_index] + np.testing.assert_almost_equal(result, expected_tnr) + + def test_invalid_method(self): + fp = np.array([[2, 1, 0], [1, 0, 1]]) + tn = np.array([[10, 5, 0], [7, 8, 9]]) + + with self.assertRaises(ValueError): + tnr(fp, tn, method="invalid_method", check_inputs=True) + + +if __name__ == "__main__": + unittest.main() diff --git 
a/tests/metrics/test_tpr.py b/tests/metrics/test_tpr.py new file mode 100644 index 0000000..21adbca --- /dev/null +++ b/tests/metrics/test_tpr.py @@ -0,0 +1,211 @@ +import unittest +from unittest.mock import patch +import numpy as np + +from streamauc.metrics._tpr import tpr_micro, tpr_macro, tpr_onevsall, tpr +from streamauc.utils import * + + +# Test case with a simple example +tp = np.array([[10, 5, 0], [7, 8, 9]]) +fp = np.array([[2, 1, 0], [1, 0, 1]]) +fn = np.array([[1, 0, 2], [0, 2, 1]]) +tn = np.array([[5, 10, 15], [20, 15, 10]]) + + +class TestTPRFunction(unittest.TestCase): + @patch("streamauc.metrics._tpr.tpr_micro") + @patch("streamauc.metrics._tpr.tpr_macro") + @patch("streamauc.metrics._tpr.tpr_onevsall") + def test_tpr_micro(self, mock_onevsall, mock_macro, mock_micro): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + + mock_micro.return_value = np.array([0.83333333, 0.92307692]) + + result = tpr(tp, fn, method=AggregationMethod.MICRO, check_inputs=True) + + mock_micro.assert_called_once_with(tp=tp, fn=fn, check_inputs=True) + mock_macro.assert_not_called() + mock_onevsall.assert_not_called() + + np.testing.assert_almost_equal(result, mock_micro.return_value) + + @patch("streamauc.metrics._tpr.tpr_micro") + @patch("streamauc.metrics._tpr.tpr_macro") + @patch("streamauc.metrics._tpr.tpr_onevsall") + def test_tpr_macro(self, mock_onevsall, mock_macro, mock_micro): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + + mock_macro.return_value = np.array([0.83333333, 0.88888889]) + + result = tpr(tp, fn, method=AggregationMethod.MACRO, check_inputs=True) + + mock_macro.assert_called_once_with(tp=tp, fn=fn, check_inputs=True) + mock_micro.assert_not_called() + mock_onevsall.assert_not_called() + + np.testing.assert_almost_equal(result, mock_macro.return_value) + + @patch("streamauc.metrics._tpr.tpr_micro") + @patch("streamauc.metrics._tpr.tpr_macro") + @patch("streamauc.metrics._tpr.tpr_onevsall") + def test_tpr_onevsall(self, mock_onevsall, mock_macro, mock_micro): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + class_index = 1 + + mock_onevsall.return_value = np.array( + [[0.83333333, 0.92307692], [0.8, 0.9]] + ) + + result = tpr( + tp, + fn, + method=AggregationMethod.ONE_VS_ALL, + class_index=class_index, + check_inputs=True, + ) + + mock_onevsall.assert_called_once_with(tp=tp, fn=fn, check_inputs=True) + mock_micro.assert_not_called() + mock_macro.assert_not_called() + + np.testing.assert_almost_equal( + result, mock_onevsall.return_value[..., class_index] + ) + + def test_invalid_method(self): + tp = np.array([[10, 5, 0], [7, 8, 9]]) + fn = np.array([[1, 0, 2], [0, 2, 1]]) + + with self.assertRaises(ValueError): + tpr(tp, fn, method="invalid_method", check_inputs=True) + + +class TestComputeTprMicro(unittest.TestCase): + def test_compute_tpr_micro(self): + expected_tpr = np.array([0.83333333, 0.88888889]) + _tpr = tpr_micro(tp=tp, fn=fn) + np.testing.assert_almost_equal(_tpr, expected_tpr) + + def test_zero_division(self): + # Test case with zeros to ensure no division by zero errors + tp = np.array([[0, 0, 0], [0, 0, 0]]) + fn = np.array([[0, 0, 0], [0, 0, 0]]) + + expected_tpr = np.array([0.0, 0.0]) + + _tpr = tpr_micro(tp=tp, fn=fn) + + np.testing.assert_almost_equal(_tpr, expected_tpr) + + def test_invalid_inputs(self): + # Test case with zeros to ensure no division by zero errors + tp = np.array([[0, 0, 0], [0, -1, 0]]) + fn = np.array([[0, 0, 0], [0, 0, 0]]) + + with 
self.assertRaises(AssertionError): + _tpr = tpr_micro(tp=tp, fn=fn) + + tp = np.array([[0, 0, 0, 0], [0, 1, 0, 0]]) + with self.assertRaises(AssertionError): + _tpr = tpr_micro(tp=tp, fn=fn) + + tp = np.array([[[0, 0, 0], [0, 1, 2]]]) + with self.assertRaises(AssertionError): + _tpr = tpr_micro(tp=tp, fn=fn) + + _tpr = tpr_micro(tp=tp, fn=fn, check_inputs=False) + + +class TestComputeTprMacro(unittest.TestCase): + def test_compute_roc_macro(self): + # Test case with a simple example + tp = np.array([[10, 5], [7, 8]]) + fn = np.array([[1, 0], [0, 2]]) + + tpr_classwise = np.array([[0.90909091, 1.0], [1.0, 0.8]]) + + expected_tpr = np.mean(tpr_classwise, -1) + + _tpr = tpr_macro(tp=tp, fn=fn) + np.testing.assert_almost_equal(_tpr, expected_tpr) + + def test_zero_division(self): + # Test case with zeros to ensure no division by zero errors + tp = np.array([[0, 0], [0, 0]]) + fn = np.array([[0, 0], [0, 0]]) + + expected_tpr = np.array([0.0, 0.0]) + + _tpr = tpr_macro(tp=tp, fn=fn) + + np.testing.assert_almost_equal(_tpr, expected_tpr) + + def test_invalid_inputs(self): + # Test case with zeros to ensure no division by zero errors + tp = np.array([[0, 0, 0], [0, 0, -1]]) + fp = np.array([[0, 0, 0], [0, 0, 0]]) + fn = np.array([[0, 0, 0], [0, 0, 0]]) + tn = np.array([[0, 0, 0], [0, 0, 0]]) + + with self.assertRaises(AssertionError): + _tpr = tpr_macro(tp=tp, fn=fn) + + tp = np.array([[0, 0, 0, 0], [0, 1, 0, 0]]) + with self.assertRaises(AssertionError): + _tpr = tpr_macro(tp=tp, fn=fn) + + tp = np.array([[[0, 0, 0], [0, 1, 2]]]) + with self.assertRaises(AssertionError): + _tpr = tpr_macro(tp=tp, fn=fn) + + _tpr = tpr_macro(tp=tp, fn=fn, check_inputs=False) + + +class TestComputeRoc1vsAll(unittest.TestCase): + + def test_compute_roc_1vsall(self): + # Test case with a simple example + tp = np.array([[10, 5], [7, 8]]) + fn = np.array([[1, 0], [0, 2]]) + + expected_tpr = np.array([[0.90909091, 1.0], [1.0, 0.8]]) + + for i in range(tp.shape[1]): + _tpr = tpr_onevsall(tp=tp, fn=fn)[:, i] + np.testing.assert_almost_equal(_tpr, expected_tpr[:, i]) + + def test_zero_division_1vsall(self): + # Test case with zeros to ensure no division by zero errors + tp = np.array([[0, 0], [0, 0]]) + fn = np.array([[0, 0], [0, 0]]) + class_index = 0 + + expected_tpr = np.array([0.0, 0.0]) + + _tpr = tpr_onevsall(tp=tp, fn=fn)[:, class_index] + + np.testing.assert_almost_equal(_tpr, expected_tpr) + + def test_invalid_inputs_1vsall(self): + # Test case with zeros to ensure no division by zero errors + tp = np.array([[0, 0, 0], [0, 0, -1]]) + fn = np.array([[0, 0, 0], [0, 0, 0]]) + + with self.assertRaises(AssertionError): + _tpr = tpr_onevsall(tp=tp, fn=fn) + + tp = np.array([[0, 0, 0, 0], [0, 1, 0, 0]]) + with self.assertRaises(AssertionError): + _tpr = tpr_macro(tp=tp, fn=fn) + + tp = np.array([[[0, 0, 0], [0, 1, 2]]]) + with self.assertRaises(AssertionError): + _tpr = tpr_macro(tp=tp, fn=fn) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_base.py b/tests/test_base.py deleted file mode 100644 index 242732a..0000000 --- a/tests/test_base.py +++ /dev/null @@ -1,5 +0,0 @@ -from streamauc.base import NAME - - -def test_base(): - assert NAME == "streamauc" diff --git a/tests/test_streaming_metrics.py b/tests/test_streaming_metrics.py new file mode 100644 index 0000000..c266e0a --- /dev/null +++ b/tests/test_streaming_metrics.py @@ -0,0 +1,419 @@ +import numpy as np +import unittest +from streamauc import StreamingMetrics, AggregationMethod, auc +from streamauc.metrics import f1_score, tpr, fpr 
+import numpy as np + +from sklearn.datasets import load_iris +from sklearn.model_selection import train_test_split +from sklearn.linear_model import LogisticRegression +from sklearn import metrics +from sklearn.preprocessing import LabelBinarizer +from sklearn.metrics import ( + precision_recall_curve as sk_precision_recall_curve, + roc_curve as sk_roc_curve, + confusion_matrix, +) + +_conf_matr_multiclass = np.array( + [ + [[19, 10, 17], [0, 14, 17], [0, 3, 2]], + [[5, 10, 10], [16, 2, 12], [6, 9, 12]], + [[4, 0, 16], [3, 10, 8], [18, 13, 10]], + [[0, 15, 19], [3, 20, 0], [3, 18, 4]], + ] +) + + +class TestInit(unittest.TestCase): + def setUp(self): + np.random.seed(1234) + + def test_unsorted_thresholds(self): + thresholds = np.linspace(0, 1, 100) + np.random.shuffle(thresholds) + + self.assertFalse(np.all(thresholds[:-1] >= thresholds[1:])) + curve = StreamingMetrics( + thresholds=thresholds, + num_classes=3, + ) + self.assertTrue(np.all(curve.thresholds[:-1] >= curve.thresholds[1:])) + + def test_invalid_input(self): + thresholds = np.arange(20) + with self.assertRaises(ValueError): + curve = StreamingMetrics( + thresholds=thresholds, + num_classes=3, + ) + thresholds = thresholds[:1] + with self.assertRaises(ValueError): + curve = StreamingMetrics( + thresholds=thresholds, + num_classes=3, + ) + with self.assertRaises(ValueError): + curve = StreamingMetrics( + num_thresholds=1, + num_classes=3, + ) + with self.assertRaises(ValueError): + curve = StreamingMetrics( + num_thresholds=10, + num_classes=1, + ) + + +class TestConfusionMatrixUpdates(unittest.TestCase): + def setUp(self): + np.random.seed(1234) + + def test_reset(self): + thresholds = np.linspace(0, 1, 100) + curve = StreamingMetrics( + thresholds=thresholds, + num_classes=3, + ) + expected_empty_confm = np.zeros( + (len(thresholds) + 2, curve.num_classes, 2, 2), + dtype=int, + ) + np.testing.assert_array_equal( + curve.confusion_matrix, expected_empty_confm + ) + + y_true = np.random.randint(0, 2, (10, curve.num_classes)) + y_pred = np.random.random((10, curve.num_classes)) + y_pred = y_pred / y_pred.sum(-1, keepdims=True) + + curve.update(y_true=y_true, y_score=y_pred) + conf1 = curve.confusion_matrix.copy() + + curve.reset() + y_true = np.argmax(y_true, -1) + curve.update(y_true=y_true, y_score=y_pred) + conf2 = curve.confusion_matrix.copy() + np.testing.assert_array_equal(conf1, conf2) + + self.assertFalse( + np.allclose(curve.confusion_matrix, expected_empty_confm) + ) + + curve.reset() + self.assertTrue( + np.allclose(curve.confusion_matrix, expected_empty_confm) + ) + + def test_invalid_input(self): + + thresholds = np.linspace(0, 1, 100) + curve = StreamingMetrics( + thresholds=thresholds, + num_classes=3, + ) + with self.assertRaises(ValueError): + y_true = np.random.randint(0, 2, (10, curve.num_classes)) + y_pred = np.random.randint(0, 2, (10, curve.num_classes + 1)) + curve.update(y_true=y_true, y_score=y_pred) + + with self.assertRaises(ValueError): + y_true = np.random.randint(0, 2, (11, curve.num_classes)) + y_pred = np.random.randint(0, 2, (10, curve.num_classes)) + curve.update(y_true=y_true, y_score=y_pred) + + with self.assertRaises(ValueError): + y_true = np.random.randint(0, 2, (10, curve.num_classes, 2, 2)) + y_pred = np.random.randint(0, 2, (10, curve.num_classes, 2, 2)) + curve.update(y_true=y_true, y_score=y_pred) + with self.assertRaises(ValueError): + y_true = np.random.randint(0, 2, (10, curve.num_classes)) + y_pred = np.random.randint(0, 2, (10, curve.num_classes, 2, 2)) + 
curve.update(y_true=y_true, y_score=y_pred) + + # should not throw any errors + y_true = np.random.randint(0, 2, (10, curve.num_classes, 1, 1, 1, 1)) + y_pred = np.random.randint(0, 2, (10, curve.num_classes)) + curve.update(y_true=y_true, y_score=y_pred) + + # should not throw any errors + y_true = np.random.randint(0, 2, (10,)) + y_pred = np.random.randint(0, 2, (10, curve.num_classes)) + curve.update(y_true=y_true, y_score=y_pred) + + # should not throw any errors + y_true = np.random.randint(0, 2, (10, 1, 1, 1, 1)) + y_pred = np.random.randint(0, 2, (10, curve.num_classes)) + curve.update(y_true=y_true, y_score=y_pred) + + # should not throw any errors + y_true = np.random.randint(0, 2, (10, 1, 1, 1, 1)) + y_pred = np.random.randint(0, 2, (10, curve.num_classes)) + curve.update(y_true=y_true, y_score=y_pred, check_inputs=False) + + # y_true = np.random.randint(0, 2, (10, curve.num_classes, 1, 2)) + # y_pred = np.random.randint(0, 2, (10, curve.num_classes, 1, 2)) + # curve.update(y_true=y_true, y_pred=y_pred, check_inputs=False) + + +class TestMetricsMulticlass(unittest.TestCase): + def setUp(self): + self.num_thresholds = 3 + self.dim = 2 + self.confusion_matrix = _conf_matr_multiclass + + self.curve = StreamingMetrics( + num_thresholds=self.num_thresholds, + num_classes=self.dim, + ) + self.curve._confusion_matrix = self.confusion_matrix + + def test_true_negatives(self): + calculated_val = self.curve.calc_metric( + metric=f1_score, method=AggregationMethod.MACRO + ) + expected_val = f1_score( + tp=self.curve.true_positives(), + fn=self.curve.false_negatives(), + fp=self.curve.false_positives(), + ) + np.testing.assert_array_equal(expected_val, calculated_val) + + +class TestAUCMulticlass(unittest.TestCase): + def setUp(self): + + iris = load_iris() + X, y = iris.data, iris.target + y = iris.target_names[y] + + random_state = np.random.RandomState(0) + n_samples, n_features = X.shape + X = np.concatenate( + [X, random_state.randn(n_samples, 200 * n_features)], axis=1 + ) + ( + X_train, + X_test, + y_train, + y_test, + ) = train_test_split(X, y, test_size=0.5, stratify=y, random_state=0) + + classifier = LogisticRegression() + self.y_score = classifier.fit(X_train, y_train).predict_proba(X_test) + + label_binarizer = LabelBinarizer().fit(y_train) + + self.y_test = np.argmax(label_binarizer.transform(y_test), -1) + + thresholds = np.unique(self.y_score) + self.dim = 3 + self.curve = StreamingMetrics( + thresholds=thresholds, + num_classes=self.dim, + ) + + def test_sklearn(self): + + for class_idx in range(self.dim): + + fpr_sk, tpr_sk, thresholds = metrics.roc_curve( + self.y_test == class_idx, self.y_score[..., class_idx] + ) + + self.curve = StreamingMetrics( + thresholds=thresholds[1:], + num_classes=self.dim, + ) + self.curve.reset() + self.curve.update(y_true=self.y_test, y_score=self.y_score) + # + tpr_ours = self.curve.calc_metric( + metric=tpr, method=AggregationMethod.ONE_VS_ALL + ).squeeze()[:, class_idx] + fpr_ours = self.curve.calc_metric( + metric=fpr, method=AggregationMethod.ONE_VS_ALL + ).squeeze()[:, class_idx] + + np.testing.assert_allclose(tpr_ours[:-1], tpr_sk) + np.testing.assert_allclose(fpr_ours[:-1], fpr_sk) + self.assertEqual(tpr_sk.shape[0], tpr_ours.shape[0] - 1) + + auc_sk = metrics.auc(fpr_sk, tpr_sk) + auc_custom = auc(fpr_ours, tpr_ours) + auc_curve = self.curve.auc(class_index=class_idx) + + self.assertTrue(np.isclose(auc_custom, auc_sk)) + self.assertTrue(np.isclose(auc_sk, auc_curve)) + + self.curve.false_positives() + + +class 
TestStreamingMetrics(unittest.TestCase): + def setUp(self): + iris = load_iris() + X, y = iris.data, iris.target + y = iris.target_names[y] + + random_state = np.random.RandomState(0) + n_samples, n_features = X.shape + X = np.concatenate( + [X, random_state.randn(n_samples, 200 * n_features)], axis=1 + ) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.5, stratify=y, random_state=0 + ) + + classifier = LogisticRegression(max_iter=1000) + self.y_score = classifier.fit(X_train, y_train).predict_proba(X_test) + + label_binarizer = LabelBinarizer().fit(y_train) + self.y_test = np.argmax(label_binarizer.transform(y_test), -1) + + thresholds = np.unique(self.y_score) + self.dim = 3 + self.curve = StreamingMetrics( + thresholds=thresholds, + num_classes=self.dim, + ) + + # check that multiple updates have the same effect as one big.. + half = self.y_test.shape[0] // 2 + self.curve.update(self.y_test[:half], self.y_score[:half]) + self.curve.update(self.y_test[half:], self.y_score[half:]) + + def test_total(self): + new_curve = StreamingMetrics( + num_thresholds=100, + num_classes=self.dim, + ) + + self.assertEqual(new_curve._total().shape, (100, self.dim)) + np.testing.assert_allclose( + new_curve._total(), np.zeros_like(new_curve._total()) + ) + + new_curve.update(self.y_test, self.y_score) + new_curve.update(self.y_test, self.y_score) + self.assertEqual(new_curve._total().shape, (100, self.dim)) + + np.testing.assert_allclose( + new_curve._total(), + 2 * self.y_test.shape[0] * np.ones_like(new_curve._total()), + ) + + def test_confusion_matrix(self): + for class_idx in range(self.dim): + y_true = self.y_test == class_idx + + for threshold in self.curve.thresholds: + y_pred = self.y_score[:, class_idx] >= threshold + + # sklearn has the confusion matrix flipped + confm_ref = np.flip(confusion_matrix(y_true, y_pred)) + + computed_confm = self.curve.confusion_matrix[ + self.curve.thresholds.tolist().index(threshold), class_idx + ] + np.testing.assert_array_equal(computed_confm, confm_ref) + + def test_precision_recall_curve(self): + for class_idx in range(self.dim): + precision, recall, thresholds = sk_precision_recall_curve( + self.y_test == class_idx, self.y_score[:, class_idx] + ) + + new_curve = StreamingMetrics( + thresholds=thresholds, + num_classes=self.dim, + ) + + # check that multiple updates have the same effect as one big.. + half = self.y_test.shape[0] // 2 + new_curve.update(self.y_test[:half], self.y_score[:half]) + new_curve.update(self.y_test[half:], self.y_score[half:]) + stream_prec, stream_recall, stream_thresholds = ( + new_curve.precision_recall_curve(class_index=class_idx) + ) + np.testing.assert_almost_equal(stream_thresholds[1:], thresholds) + np.testing.assert_almost_equal(precision[:1], stream_prec[:1]) + np.testing.assert_almost_equal(recall, stream_recall) + + def test_roc_curve(self): + for class_idx in range(self.dim): + _fpr, _tpr, thresholds = sk_roc_curve( + self.y_test == class_idx, self.y_score[:, class_idx] + ) + + new_curve = StreamingMetrics( + thresholds=thresholds[1:], + num_classes=self.dim, + ) + + # ensure that multiple updates have the same effect as one big.. 
+ half = self.y_test.shape[0] // 2 + new_curve.update(self.y_test[:half], self.y_score[:half]) + new_curve.update(self.y_test[half:], self.y_score[half:]) + + streaming_fpr, streaming_tpr, _thr = new_curve.roc_curve( + class_index=class_idx + ) + np.testing.assert_almost_equal(_fpr, streaming_fpr[:-1]) + np.testing.assert_almost_equal(_tpr, streaming_tpr[:-1]) + + +class TestStreamingIdentities(unittest.TestCase): + def setUp(self): + iris = load_iris() + X, y = iris.data, iris.target + y = iris.target_names[y] + + random_state = np.random.RandomState(0) + n_samples, n_features = X.shape + X = np.concatenate( + [X, random_state.randn(n_samples, 200 * n_features)], axis=1 + ) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.5, stratify=y, random_state=0 + ) + + classifier = LogisticRegression(max_iter=1000) + self.y_score = classifier.fit(X_train, y_train).predict_proba(X_test) + + label_binarizer = LabelBinarizer().fit(y_train) + self.y_test = np.argmax(label_binarizer.transform(y_test), -1) + + thresholds = np.unique(self.y_score) + self.dim = 3 + self.curve = StreamingMetrics( + thresholds=thresholds, + num_classes=self.dim, + ) + + # check that multiple updates have the same effect as one big.. + half = self.y_test.shape[0] // 2 + self.curve.update(self.y_test[:half], self.y_score[:half]) + self.curve.update(self.y_test[half:], self.y_score[half:]) + + def test_total(self): + total = self.curve._total().squeeze() + + tp = self.curve.true_positives() + fp = self.curve.false_positives() + tn = self.curve.true_negatives() + fn = self.curve.false_negatives() + + pp = self.curve.predicted_positives() + pn = self.curve.predicted_negatives() + p = self.curve.positives() + n = self.curve.negatives() + + np.testing.assert_array_equal(total, tp + fp + tn + fn) + np.testing.assert_array_equal(p, tp + fn) + np.testing.assert_array_equal(n, tn + fp) + np.testing.assert_array_equal(pp, tp + fp) + np.testing.assert_array_equal(pn, tn + fn) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/utils/test_plots.py b/tests/utils/test_plots.py new file mode 100644 index 0000000..080f113 --- /dev/null +++ b/tests/utils/test_plots.py @@ -0,0 +1,117 @@ +import unittest +from typing import Tuple, List +import matplotlib.pyplot as plt +import math +from matplotlib.testing.decorators import cleanup +import numpy as np + +from streamauc.plot_util import create_square_subplots, plot_curve_and_auc + + +class TestCreateSquareSubplots(unittest.TestCase): + + def test_correct_number_of_subplots(self): + num_subplots = 7 + fig, axs = create_square_subplots(num_subplots) + self.assertEqual( + len(axs), + num_subplots, + "The number of subplots created" + " does not match the requested number.", + ) + + def test_subplot_layout(self): + num_subplots = 7 + fig, axs = create_square_subplots(num_subplots) + num_cols = math.ceil(math.sqrt(num_subplots)) + num_rows = math.ceil(num_subplots / num_cols) + + self.assertEqual( + len(axs), + num_subplots, + "The number of axes in the figure " + "does not match the requested number.", + ) + + def test_unused_subplots_hidden(self): + num_subplots = 5 + fig, axs = create_square_subplots(num_subplots) + + num_cols = math.ceil(math.sqrt(num_subplots)) + num_rows = math.ceil(num_subplots / num_cols) + + total_subplots = num_rows * num_cols + for i in range(num_subplots, total_subplots): + with self.assertRaises( + IndexError, msg=f"Axes 
index {i} should not exist." + ): + axs[i] + + visible_axes = axs + self.assertEqual( + len(visible_axes), + num_subplots, + "There are unused subplots that are not hidden.", + ) + + def test_single_subplot(self): + num_subplots = 1 + fig, axs = create_square_subplots(num_subplots) + self.assertEqual( + len(axs), + num_subplots, + "The number of subplots created does not match the requested number.", + ) + self.assertIsInstance( + axs[0], + plt.Axes, + "The single subplot is not an instance of plt.Axes.", + ) + + def test_empty_case(self): + num_subplots = 0 + with self.assertRaises( + ValueError, msg="Creating 0 subplots should raise a ValueError." + ): + create_square_subplots(num_subplots) + + +class TestPlotcurveAUC(unittest.TestCase): + + @cleanup + def test_single_class_plot(self): + fpr = np.array([0.0, 0.1, 0.2, 0.3, 1.0]) + tpr = np.array([0.0, 0.4, 0.6, 0.8, 1.0]) + thresholds = np.array([1.0, 0.9, 0.8, 0.7, 0.0]) + + fig = plot_curve_and_auc(fpr, tpr, thresholds) + self.assertIsInstance(fig, plt.Figure) + plt.close(fig) + + @cleanup + def test_multi_class_plot(self): + fpr = np.array( + [[0.0, 0.1, 0.2, 0.3, 1.0], [0.0, 0.2, 0.4, 0.6, 1.0]] + ).T + tpr = np.array( + [[0.0, 0.4, 0.6, 0.8, 1.0], [0.0, 0.5, 0.7, 0.85, 1.0]] + ).T + thresholds = np.array([1.0, 0.9, 0.8, 0.7, 0.0]) + class_names = ["Class 1", "Class 2"] + + fig = plot_curve_and_auc(fpr, tpr, thresholds, class_names=class_names) + self.assertIsInstance(fig, plt.Figure) + plt.close(fig) + + @cleanup + def test_invalid_fpr_shape(self): + fpr = np.array([[[0.0, 0.1, 0.2, 0.3, 1.0]]]) + tpr = np.array([[[0.0, 0.4, 0.6, 0.8, 1.0]]]) + thresholds = np.array([1.0, 0.9, 0.8, 0.7, 0.0]) + + with self.assertRaises(NotImplementedError): + plot_curve_and_auc(fpr, tpr, thresholds) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/utils/test_util.py b/tests/utils/test_util.py new file mode 100644 index 0000000..ef6c10d --- /dev/null +++ b/tests/utils/test_util.py @@ -0,0 +1,71 @@ +import unittest + +from streamauc.utils import _all_equal + + +class TestAllEqual(unittest.TestCase): + def test_all_equal(self): + # Test case with all equal elements + self.assertTrue( + _all_equal([1, 1, 1, 1]), + "All elements are equal, " "should return True", + ) + self.assertTrue( + _all_equal(["a", "a", "a", "a"]), + "All elements are equal, " "should return True", + ) + self.assertTrue( + _all_equal([True, True, True]), + "All elements are equal, " "should return True", + ) + + def test_not_all_equal(self): + # Test case with not all equal elements + self.assertFalse( + _all_equal([1, 2, 1, 1]), + "Not all elements are equal, " "should return False", + ) + self.assertFalse( + _all_equal(["a", "b", "a", "a"]), + "Not all elements are equal, " "should return False", + ) + self.assertFalse( + _all_equal([True, False, True]), + "Not all elements are equal, " "should return False", + ) + + def test_empty_iterable(self): + # Test case with an empty list + self.assertTrue(_all_equal([]), "Empty list, should return True") + + def test_single_element(self): + # Test case with a single element + self.assertTrue( + _all_equal([1]), "Single element list, should return True" + ) + self.assertTrue( + _all_equal(["a"]), "Single element list, should return True" + ) + + def test_mixed_types(self): + # Test case with different types + self.assertFalse( + _all_equal([1, "1", 1.0]), "Different types, should return False" + ) + self.assertTrue( + _all_equal([1.0, 1.0, 1.0]), + "All floats are equal, should return True", + ) + + def 
test_large_input(self): + # Test case with large input + large_list = [1] * 1000000 + self.assertTrue( + _all_equal(large_list), + "Large list with all equal elements, " "should return True", + ) + large_list[999999] = 2 + self.assertFalse( + _all_equal(large_list), + "Large list with one different element, " "should return False", + ) From 6d95c383c38815512593fc396de67aee37832bc3 Mon Sep 17 00:00:00 2001 From: Fabricio Arend Torres <9096900+FabricioArendTorres@users.noreply.github.com> Date: Fri, 19 Jul 2024 12:25:59 +0200 Subject: [PATCH 02/12] removed old code --- streamauc/metrics/_f1_score.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/streamauc/metrics/_f1_score.py b/streamauc/metrics/_f1_score.py index 003aace..cf809e5 100644 --- a/streamauc/metrics/_f1_score.py +++ b/streamauc/metrics/_f1_score.py @@ -39,11 +39,6 @@ def f1_onevsall( check_confusion_matrix_entries(fn, tp, fp) _f1 = (2 * tp) / (2 * tp + fp + fn + 1e-12) - # _f1 = np.divide( - # 2 * tp, - # (2 * tp + fp + fn), - # where=(2 * tp + fp + fn) != 0, - # ) return _f1 From 2544934ee10477b12e81ae9240b81071192bbcd1 Mon Sep 17 00:00:00 2001 From: Fabricio Arend Torres <9096900+FabricioArendTorres@users.noreply.github.com> Date: Fri, 19 Jul 2024 12:30:26 +0200 Subject: [PATCH 03/12] added readme --- README.md | 220 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 160 insertions(+), 60 deletions(-) diff --git a/README.md b/README.md index 9a3fd04..983af97 100644 --- a/README.md +++ b/README.md @@ -1,85 +1,185 @@ +# streamauc -# Python Project Template +[![codecov](https://codecov.io/gh/FabricioArendTorres/streamAUC/branch/main/graph/badge.svg?token=streamAUC_token_here)](https://codecov.io/gh/FabricioArendTorres/streamAUC) +[![CI](https://github.com/FabricioArendTorres/streamAUC/actions/workflows/main.yml/badge.svg)](https://github.com/FabricioArendTorres/streamAUC/actions/workflows/main.yml) -A low dependency and really simple to start project template for Python Projects. +## Multi-Class Classification Metrics from data streams and minibatches -See also -- [Flask-Project-Template](https://github.com/rochacbruno/flask-project-template/) for a full feature Flask project including database, API, admin interface, etc. -- [FastAPI-Project-Template](https://github.com/rochacbruno/fastapi-project-template/) The base to start an openapi project featuring: SQLModel, Typer, FastAPI, JWT Token Auth, Interactive Shell, Management Commands. +A low dependency python package for keeping track of classification metrics +such as AUC given probabilistic outputs. -### HOW TO USE THIS TEMPLATE +In essence, the package keeps track of one-vs-all confusion matrices for each +class for a range of thresholds. +This allows a minibatch based updating of the things such as ROC or +Precision-Recall curves, without having to store all the predictions. +Metrics can then be computed either in a one-vs-all fashion, or by micro- +or macro averaging. -> **DO NOT FORK** this is meant to be used from **[Use this template](https://github.com/rochacbruno/python-project-template/generate)** feature. +My main usage is for multiclass semantic segmentation, where the train and +test data becomes rather large for pixel-wise performance metrics. -1. Click on **[Use this template](https://github.com/rochacbruno/python-project-template/generate)** -3. Give a name to your project - (e.g. `my_awesome_project` recommendation is to use all lowercase and underscores separation for repo names.) -3. 
Wait until the first run of CI finishes - (Github Actions will process the template and commit to your new repo) -4. If you want [codecov](https://about.codecov.io/sign-up/) Reports and Automatic Release to [PyPI](https://pypi.org) - On the new repository `settings->secrets` add your `PYPI_API_TOKEN` and `CODECOV_TOKEN` (get the tokens on respective websites) -4. Read the file [CONTRIBUTING.md](CONTRIBUTING.md) -5. Then clone your new project and happy coding! +This package supports a range of classical performance metrics, such as: +- TPR, FNR, FPR, TNR, Accuracy, F1-Score, Jaccard Index, ... +- Corresponding curves, such as Precision-Recall (PR) curves or ROC curves. +- AUC of ROC and PR curves, or any combination of two metrics you want. +- One-vs-all, micro, or macro averaging of metrics for a set of predefined + thresholds. -> **NOTE**: **WAIT** until first CI run on github actions before cloning your new project. +## Lightweight, tested, and permissive License -### What is included on this template? +- Only Numpy and Matplotlib are requirements. +- High Test Coverage: Metrics are unit tested against sklearn metrics. +- Permissive License: Licensed under Apache 2.0. -- ๐Ÿ–ผ๏ธ Templates for starting multiple application types: - * **Basic low dependency** Python program (default) [use this template](https://github.com/rochacbruno/python-project-template/generate) - * **Flask** with database, admin interface, restapi and authentication [use this template](https://github.com/rochacbruno/flask-project-template/generate). - **or Run `make init` after cloning to generate a new project based on a template.** -- ๐Ÿ“ฆ A basic [setup.py](setup.py) file to provide installation, packaging and distribution for your project. - Template uses setuptools because it's the de-facto standard for Python packages, you can run `make switch-to-poetry` later if you want. -- ๐Ÿค– A [Makefile](Makefile) with the most useful commands to install, test, lint, format and release your project. -- ๐Ÿ“ƒ Documentation structure using [mkdocs](http://www.mkdocs.org) -- ๐Ÿ’ฌ Auto generation of change log using **gitchangelog** to keep a HISTORY.md file automatically based on your commit history on every release. -- ๐Ÿ‹ A simple [Containerfile](Containerfile) to build a container image for your project. - `Containerfile` is a more open standard for building container images than Dockerfile, you can use buildah or docker with this file. -- ๐Ÿงช Testing structure using [pytest](https://docs.pytest.org/en/latest/) -- โœ… Code linting using [flake8](https://flake8.pycqa.org/en/latest/) -- ๐Ÿ“Š Code coverage reports using [codecov](https://about.codecov.io/sign-up/) -- ๐Ÿ›ณ๏ธ Automatic release to [PyPI](https://pypi.org) using [twine](https://twine.readthedocs.io/en/latest/) and github actions. -- ๐ŸŽฏ Entry points to execute your program using `python -m ` or `$ streamauc` with basic CLI argument parsing. -- ๐Ÿ”„ Continuous integration using [Github Actions](.github/workflows/) with jobs to lint, test and release your project on Linux, Mac and Windows environments. +## Usage +Below you can find pseudocode for the usage of this package. +For a more comprehensive and self-consistent example, see `examples/example.py`. -> Curious about architectural decisions on this template? read [ABOUT_THIS_TEMPLATE.md](ABOUT_THIS_TEMPLATE.md) -> If you want to contribute to this template please open an [issue](https://github.com/rochacbruno/python-project-template/issues) or fork and send a PULL REQUEST. 
+### Keep track of confusion matrices at many thresolds -[โค๏ธ Sponsor this project](https://github.com/sponsors/rochacbruno/) +```py +import numpy as np - +from streamauc import StreamingMetrics, AggregationMethod ---- -# streamauc +# Select the number of thresholds for which we want to keep track of results. +stream_metrics = StreamingMetrics( + thresholds=np.linspace(0, 1, 200), + num_classes=10, +) -[![codecov](https://codecov.io/gh/FabricioArendTorres/streamAUC/branch/main/graph/badge.svg?token=streamAUC_token_here)](https://codecov.io/gh/FabricioArendTorres/streamAUC) -[![CI](https://github.com/FabricioArendTorres/streamAUC/actions/workflows/main.yml/badge.svg)](https://github.com/FabricioArendTorres/streamAUC/actions/workflows/main.yml) +while youhavedata: + y_true = ... # true classes, shape (-1,) or one-hot-encoded (-1,num_classes) + pred_prob_y = ... # indicating class probabilities, shape (-1, num_classes), + stream_metrics.update(y_true=y_true, y_score=pred_prob_y) -Awesome streamauc created by FabricioArendTorres +## get 1-vs-all confusion matrix at all thresholds +confm = stream_metrics.confusion_matrix +# confm is of shape (num_thresholds, num_classes, 2, 2) -## Install it from PyPI +## get metrics at all thresholds +tp = stream_metrics.true_positives() # is of shape (num_threholds, num_classes) -```bash -pip install streamauc -``` +fpr, tpr, thresholds = stream_metrics.roc_curve( + AggregationMethod.ONE_VS_ALL) # fpr and tpr are of shape (num_thresholds, num_classes) -## Usage -```py -from streamauc import BaseClass -from streamauc import base_function +fpr, tpr, thresholds = stream_metrics.precision_recall_curve( + AggregationMethod.MACRO) # fpr and tpr are of shape (num_thresholds, ) + +# reset before updating with new data +stream_metrics.reset() -BaseClass().base_method() -base_function() ``` -```bash -$ python -m streamauc -#or -$ streamauc +### In a training loop +```py +import matplotlib.pyplot as plt + +from streamauc import StreamingMetrics, AggregationMethod, auc +from streamauc import metrics + +# Select the number of thresholds for which we want to keep track of results. +stream_metrics = StreamingMetrics( + num_thresholds=100, + num_classes=3, +) + +# Whatever your model may be, you need probabilities for the +# defined number of classes. +model = ... +yourdataiterator = ... + +for epoch in range(100): + ... # do your training step + + for mb_x, mb_y in yourdataiterator: + pred_prob_y = model.predict_proba(mb_x) # of shape (-1, num_classes) + # mb_y can be onehot encoded (-1, num_classes) or a flat integer array (-1,) + stream_metrics.update(y_true=mb_y, y_score=pred_prob_y) + + # compute metrics you want + _auc_macro = stream_metrics.auc(metrics.recall, + metrics.precision, + method=AggregationMethod.MACRO) + f1_for_all_thresholds = stream_metrics.calc_metric(metric=metrics.f1_score) + + # Plot all 1-vs-all/micro-averaged/macro-averaged Precision Recall Curves + fig = stream_metrics.precision_recall_curve(method=AggregationMethod. + ONE_VS_ALL) + fig.savefig(f"PR_one_vs_all_{epoch}.png") + plt.close(fig) + + # reset the tracker for the next epoch + stream_metrics.reset() ``` -## Development -Read the [CONTRIBUTING.md](CONTRIBUTING.md) file. + + +## Things to note + +### Curves and AUC are only approximate +StreamAUC works by keeping track of confusion matrices at different +thresholds, which are defined at the beginning. That is, the resulting +curves and AUC are by construction always approximations. 
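+
+As a concrete illustration, here is a minimal, hypothetical sketch (it is not
+part of the package's shipped examples) comparing the streaming ROC AUC for
+increasingly fine threshold grids against an exact reference value. It only
+assumes that `auc(class_index=...)` returns the one-vs-all ROC AUC, as it is
+used in the test suite; sklearn is used solely to produce scores and the
+reference value.
+```python
+from sklearn.datasets import load_breast_cancer
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import roc_auc_score
+
+from streamauc import StreamingMetrics
+
+# Scores from a simple binary classifier, only used to illustrate the point.
+X, y = load_breast_cancer(return_X_y=True)
+y_score = LogisticRegression(max_iter=1000).fit(X, y).predict_proba(X)
+
+exact_auc = roc_auc_score(y, y_score[:, 1])
+for num_thresholds in (5, 50, 500):
+    sm = StreamingMetrics(num_thresholds=num_thresholds, num_classes=2)
+    sm.update(y_true=y, y_score=y_score)
+    # The streaming estimate approaches the exact value as the grid gets finer.
+    print(num_thresholds, sm.auc(class_index=1), "vs. exact", exact_auc)
+```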
+
+This should, however, not be very limiting for applications with large data
+sets, where evaluating every unique prediction score as a threshold would be
+infeasible anyway.
+
+### Precision-Recall Curve: Definition of precision when recall is zero
+There are different conventions regarding the precision when there are no
+positive predictions, which occurs at the left-most point of the
+precision-recall curve corresponding to a threshold of 1.
+Technically, it is undefined, since we have TP/(TP+FP) = 0/0.
+Scikit-learn defines it as 1 for the sake of nicer PR curves.
+This package defines it as 0, as a value of 1 seems misleading in my opinion.
+
+## Custom Metrics
+It is straightforward to add custom metrics to this package: just define a
+function with the following interface, which can then be passed as a Callable
+to `StreamingMetrics.calc_metric` or `StreamingMetrics.auc`.
+The basic metrics (TP, FN, FP, TN) are always of shape
+`(num_thresholds, num_classes)`, with e.g. `TP[:,2]` corresponding to the
+number of true positives at each threshold in a one-vs-all setting for the
+class with index 2.
+
+See for example the F1 metric implementation for the required interface:
+```
+from streamauc.utils import AggregationMethod, check_confusion_matrix_entries
+
+def custom_f1_score(
+    tp: np.ndarray,
+    fn: np.ndarray,
+    fp: np.ndarray,
+    tn: np.ndarray,
+    method: AggregationMethod = AggregationMethod.MACRO,
+    class_index: Optional[int] = None,
+    check_inputs: bool = True,
+):
+
+    if check_inputs:
+        # do some optional checks for valid shapes etc.
+        check_confusion_matrix_entries(tp, fn, fp, tn)
+
+    if method == AggregationMethod.MICRO:
+        tp_sum = np.sum(tp, axis=-1)
+        fn_sum = np.sum(fn, axis=-1)
+        fp_sum = np.sum(fp, axis=-1)
+        _f1 = ((2 * tp_sum) / (2 * tp_sum + fp_sum + fn_sum + 1e-12))
+        _f1 = _f1[..., class_index]
+    elif method == AggregationMethod.MACRO:
+        _f1 = ((2 * tp) / (2 * tp + fp + fn + 1e-12)).mean(-1)
+    elif method == AggregationMethod.ONE_VS_ALL:
+        _f1 = ((2 * tp) / (2 * tp + fp + fn + 1e-12))[..., class_index]
+    else:
+        raise ValueError(
+            f"Method must be one of {[e.value for e in AggregationMethod]}. "
+            f"Got {method}."
+        )
+    return _f1
+```
+
+
+
From 8a56de37908b33c494672c8ae3a2f481324ffe73 Mon Sep 17 00:00:00 2001
From: Fabricio Arend Torres <9096900+FabricioArendTorres@users.noreply.github.com>
Date: Fri, 19 Jul 2024 12:35:41 +0200
Subject: [PATCH 04/12] chore: removed old files related to the template

---
 ABOUT_THIS_TEMPLATE.md | 198 -----------------------------------------
 CONTRIBUTING.md        |  23 +----
 Containerfile          |   5 --
 gen_ref_pages.py       |  26 ------
 mkdocs.yml             |   8 --
 setup.py               |   3 -
 6 files changed, 1 insertion(+), 262 deletions(-)
 delete mode 100644 ABOUT_THIS_TEMPLATE.md
 delete mode 100644 Containerfile
 delete mode 100644 gen_ref_pages.py
 delete mode 100644 mkdocs.yml

diff --git a/ABOUT_THIS_TEMPLATE.md b/ABOUT_THIS_TEMPLATE.md
deleted file mode 100644
index a145ddf..0000000
--- a/ABOUT_THIS_TEMPLATE.md
+++ /dev/null
@@ -1,198 +0,0 @@
-# About this template
-
-Hi, I created this template to help you get started with a new project.
-
-I have created and maintained a number of python libraries, applications and
-frameworks and during those years I have learned a lot about how to create a
-project structure and how to structure a project to be as modular and simple
-as possible.
-
-Some decisions I have made while creating this template are:
-
- - Create a project structure that is as modular as possible.
 - Keep it simple and easy to maintain.
- - Allow for a lot of flexibility and customizability. - - Low dependency (this template doesn't add dependencies) - -## Structure - -Lets take a look at the structure of this template: - -```text -โ”œโ”€โ”€ Containerfile # The file to build a container using buildah or docker -โ”œโ”€โ”€ CONTRIBUTING.md # Onboarding instructions for new contributors -โ”œโ”€โ”€ docs # Documentation site (add more .md files here) -โ”‚ย ย  โ””โ”€โ”€ index.md # The index page for the docs site -โ”œโ”€โ”€ .github # Github metadata for repository -โ”‚ย ย  โ”œโ”€โ”€ release_message.sh # A script to generate a release message -โ”‚ย ย  โ””โ”€โ”€ workflows # The CI pipeline for Github Actions -โ”œโ”€โ”€ .gitignore # A list of files to ignore when pushing to Github -โ”œโ”€โ”€ HISTORY.md # Auto generated list of changes to the project -โ”œโ”€โ”€ LICENSE # The license for the project -โ”œโ”€โ”€ Makefile # A collection of utilities to manage the project -โ”œโ”€โ”€ MANIFEST.in # A list of files to include in a package -โ”œโ”€โ”€ mkdocs.yml # Configuration for documentation site -โ”œโ”€โ”€ streamauc # The main python package for the project -โ”‚ย ย  โ”œโ”€โ”€ base.py # The base module for the project -โ”‚ย ย  โ”œโ”€โ”€ __init__.py # This tells Python that this is a package -โ”‚ย ย  โ”œโ”€โ”€ __main__.py # The entry point for the project -โ”‚ย ย  โ””โ”€โ”€ VERSION # The version for the project is kept in a static file -โ”œโ”€โ”€ README.md # The main readme for the project -โ”œโ”€โ”€ setup.py # The setup.py file for installing and packaging the project -โ”œโ”€โ”€ requirements.txt # An empty file to hold the requirements for the project -โ”œโ”€โ”€ requirements-test.txt # List of requirements for testing and devlopment -โ”œโ”€โ”€ setup.py # The setup.py file for installing and packaging the project -โ””โ”€โ”€ tests # Unit tests for the project (add mote tests files here) - โ”œโ”€โ”€ conftest.py # Configuration, hooks and fixtures for pytest - โ”œโ”€โ”€ __init__.py # This tells Python that this is a test package - โ””โ”€โ”€ test_base.py # The base test case for the project -``` - -## FAQ - -Frequent asked questions. - -### Why this template is not using [Poetry](https://python-poetry.org/) ? - -I really like Poetry and I think it is a great tool to manage your python projects, -if you want to switch to poetry, you can run `make switch-to-poetry`. - -But for this template I wanted to keep it simple. - -Setuptools is the most simple and well supported way of packaging a Python project, -it doesn't require extra dependencies and is the easiest way to install the project. - -Also, poetry doesn't have a good support for installing projects in development mode yet. - -### Why the `requirements.txt` is empty ? - -This template is a low dependency project, so it doesn't have any extra dependencies. -You can add new dependencies as you will or you can use the `make init` command to -generate a `requirements.txt` file based on the template you choose `flask, fastapi, click etc`. - -### Why there is a `requirements-test.txt` file ? - -This file lists all the requirements for testing and development, -I think the development environment and testing environment should be as similar as possible. - -Except those tools that are up to the developer choice (like ipython, ipdb etc). - -### Why the template doesn't have a `pyproject.toml` file ? - -It is possible to run `pip install https://github.com/name/repo/tarball/main` and -have pip to download the package direcly from Git repo. 
- -For that to work you need to have a `setup.py` file, and `pyproject.toml` is not -supported for that kind of installation. - -I think it is easier for example you want to install specific branch or tag you can -do `pip install https://github.com/name/repo/tarball/{TAG|REVISON|COMMIT}` - -People automating CI for your project will be grateful for having a setup.py file - -### Why isn't this template made as a cookiecutter template? - -I really like [cookiecutter](https://github.com/cookiecutter/cookiecutter) and it is a great way to create new projects, -but for this template I wanted to use the Github `Use this template` button, -to use this template doesn't require to install extra tooling such as cookiecutter. - -Just click on [Use this template](https://github.com/rochacbruno/python-project-template/generate) and you are good to go. - -The substituions are done using github actions and a simple sed script. - -### Why `VERSION` is kept in a static plain text file? - -I used to have my version inside my main module in a `__version__` variable, then -I had to do some tricks to read that version variable inside the setuptools -`setup.py` file because that would be available only after the installation. - -I decided to keep the version in a static file because it is easier to read from -wherever I want without the need to install the package. - -e.g: `cat streamauc/VERSION` will get the project version without harming -with module imports or anything else, it is useful for CI, logs and debugging. - -### Why to include `tests`, `history` and `Containerfile` as part of the release? - -The `MANIFEST.in` file is used to include the files in the release, once the -project is released to PyPI all the files listed on MANIFEST.in will be included -even if the files are static or not related to Python. - -Some build systems such as RPM, DEB, AUR for some Linux distributions, and also -internal repackaging systems tends to run the tests before the packaging is performed. - -The Containerfile can be useful to provide a safer execution environment for -the project when running on a testing environment. - -I added those files to make it easier for packaging in different formats. - -### Why conftest includes a go_to_tmpdir fixture? - -When your project deals with file system operations, it is a good idea to use -a fixture to create a temporary directory and then remove it after the test. - -Before executing each test pytest will create a temporary directory and will -change the working directory to that path and run the test. - -So the test can create temporary artifacts isolated from other tests. - -After the execution Pytest will remove the temporary directory. - -### Why this template is not using [pre-commit](https://pre-commit.com/) ? - -pre-commit is an excellent tool to automate checks and formatting on your code. - -However I figured out that pre-commit adds extra dependency and it an entry barrier -for new contributors. - -Having the linting, checks and formatting as simple commands on the [Makefile](Makefile) -makes it easier to undestand and change. - -Once the project is bigger and complex, having pre-commit as a dependency can be a good idea. - -### Why the CLI is not using click? - -I wanted to provide a simple template for a CLI application on the project main entry point -click and typer are great alternatives but are external dependencies and this template -doesn't add dependencies besides those used for development. 
- -### Why this doesn't provide a full example of application using Flask or Django? - -as I said before, I want it to be simple and multipurpose, so I decided to not include -external dependencies and programming design decisions. - -It is up to you to decide if you want to use Flask or Django and to create your application -the way you think is best. - -This template provides utilities in the Makefile to make it easier to you can run: - -```bash -$ make init -Which template do you want to apply? [flask, fastapi, click, typer]? > flask -Generating a new project with Flask ... -``` - -Then the above will download the Flask template and apply it to the project. - -## The Makefile - -All the utilities for the template and project are on the Makefile - -```bash -โฏ make -Usage: make - -Targets: -help: ## Show the help. -install: ## Install the project in dev mode. -fmt: ## Format code using black & isort. -lint: ## Run pep8, black, mypy linters. -test: lint ## Run tests and generate coverage report. -watch: ## Run tests on every change. -clean: ## Clean unused files. -virtualenv: ## Create a virtual environment. -release: ## Create a new tag for release. -docs: ## Build the documentation. -switch-to-poetry: ## Switch to poetry package manager. -init: ## Initialize the project based on an application template. -``` diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 79c792d..12838ba 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,6 +1,6 @@ # How to develop on this project -streamauc welcomes contributions from the community. +streamAUC welcomes contributions from the community. **You need PYTHON3!** @@ -90,24 +90,3 @@ docs: ## Build the documentation. switch-to-poetry: ## Switch to poetry package manager. init: ## Initialize the project based on an application template. ``` - -## Making a new release - -This project uses [semantic versioning](https://semver.org/) and tags releases with `X.Y.Z` -Every time a new tag is created and pushed to the remote repo, github actions will -automatically create a new release on github and trigger a release on PyPI. - -For this to work you need to setup a secret called `PIPY_API_TOKEN` on the project settings>secrets, -this token can be generated on [pypi.org](https://pypi.org/account/). - -To trigger a new release all you need to do is. - -1. If you have changes to add to the repo - * Make your changes following the steps described above. - * Commit your changes following the [conventional git commit messages](https://www.conventionalcommits.org/en/v1.0.0/). -2. Run the tests to ensure everything is working. -4. Run `make release` to create a new tag and push it to the remote repo. - -the `make release` will ask you the version number to create the tag, ex: type `0.1.1` when you are asked. - -> **CAUTION**: The make release will change local changelog files and commit all the unstaged changes you have. diff --git a/Containerfile b/Containerfile deleted file mode 100644 index b64a251..0000000 --- a/Containerfile +++ /dev/null @@ -1,5 +0,0 @@ -FROM python:3.7-slim -COPY . /app -WORKDIR /app -RUN pip install . 
-CMD ["streamauc"] diff --git a/gen_ref_pages.py b/gen_ref_pages.py deleted file mode 100644 index edb65db..0000000 --- a/gen_ref_pages.py +++ /dev/null @@ -1,26 +0,0 @@ -"""Generate the code reference pages.""" - -from pathlib import Path - -import mkdocs_gen_files - -root = Path(__file__).parent.parent -src = root / "streamauc" - -for path in sorted(src.rglob("*.py")): - module_path = path.relative_to(src).with_suffix("") - doc_path = path.relative_to(src).with_suffix(".md") - full_doc_path = Path("reference", doc_path) - - parts = tuple(module_path.parts) - - if parts[-1] == "__init__": - parts = parts[:-1] - elif parts[-1] == "__main__": - continue - - with mkdocs_gen_files.open(full_doc_path, "w") as fd: - identifier = ".".join(parts) - print("::: " + identifier, file=fd) - - mkdocs_gen_files.set_edit_path(full_doc_path, path.relative_to(root)) \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml deleted file mode 100644 index f4ec486..0000000 --- a/mkdocs.yml +++ /dev/null @@ -1,8 +0,0 @@ -site_name: streamauc -theme: readthedocs - -plugins: -- search -- gen-files: - scripts: - - gen_ref_pages.py \ No newline at end of file diff --git a/setup.py b/setup.py index b337a85..e86c7c9 100644 --- a/setup.py +++ b/setup.py @@ -39,8 +39,5 @@ def read_requirements(path): author="FabricioArendTorres", packages=find_packages(exclude=["tests", ".github"]), install_requires=read_requirements("requirements.txt"), - entry_points={ - "console_scripts": ["streamauc = streamauc.__main__:main"] - }, extras_require={"test": read_requirements("requirements-test.txt")}, ) From f0742795b0b543da7db1a6cdf201105e670e9c7c Mon Sep 17 00:00:00 2001 From: Fabricio Arend Torres <9096900+FabricioArendTorres@users.noreply.github.com> Date: Fri, 19 Jul 2024 14:01:22 +0200 Subject: [PATCH 05/12] chore: Added installation instructions --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 983af97..15b55d2 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,19 @@ This package supports a range of classical performance metrics, such as: - High Test Coverage: Metrics are unit tested against sklearn metrics. - Permissive License: Licensed under Apache 2.0. +## Installation + +### Pypi Current Release +```bash +pip install streamauc +``` + +### Latest Version from Github +```bash +pip install git+https://github.com/FabricioArendTorres/streamAUC.git +``` + + ## Usage Below you can find pseudocode for the usage of this package. For a more comprehensive and self-consistent example, see `examples/example.py`. From fdc010c98182ada204bb47e6e51347b34863e017 Mon Sep 17 00:00:00 2001 From: Fabricio Arend Torres <9096900+FabricioArendTorres@users.noreply.github.com> Date: Fri, 19 Jul 2024 14:02:31 +0200 Subject: [PATCH 06/12] chore: Fixed example code highlighting --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 15b55d2..7ce674e 100644 --- a/README.md +++ b/README.md @@ -159,7 +159,10 @@ number of true positives at each threshold in a one-vs-all setting for the class with index 2. 
See for example the F1 metric implementation for the required interface: -``` +```python +from typing import Optional +import numpy as np + from streamauc.utils import AggregationMethod, check_confusion_matrix_entries def custom_f1_score( From 38494e9a62e251ec7151e84866cdb2f55b5bde24 Mon Sep 17 00:00:00 2001 From: Fabricio Arend Torres <9096900+FabricioArendTorres@users.noreply.github.com> Date: Fri, 19 Jul 2024 15:03:28 +0200 Subject: [PATCH 07/12] chore: Fixed example code highlighting --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7ce674e..07d8574 100644 --- a/README.md +++ b/README.md @@ -85,7 +85,7 @@ stream_metrics.reset() ``` -### In a training loop +### Track metrics in a minibatch based training loop ```py import matplotlib.pyplot as plt From 010fc726f38cd7f9a7fc7ba494f07965f20a6d9a Mon Sep 17 00:00:00 2001 From: Fabricio Arend Torres <9096900+FabricioArendTorres@users.noreply.github.com> Date: Fri, 19 Jul 2024 15:03:57 +0200 Subject: [PATCH 08/12] feat: Implemented vectorized update of confusion matrices --- streamauc/streaming_metrics.py | 51 ++++++++------ tests/test_streaming_metrics.py | 119 +++++++++++++++++++++++++++++++- 2 files changed, 144 insertions(+), 26 deletions(-) diff --git a/streamauc/streaming_metrics.py b/streamauc/streaming_metrics.py index 5f413f6..072d159 100644 --- a/streamauc/streaming_metrics.py +++ b/streamauc/streaming_metrics.py @@ -47,19 +47,19 @@ def _validate_thresholds( class StreamingMetrics: """ - Class for computing metrics in a minibatch-wise, iterative, fashion. + Class for keeping track of metrics for many thresholds in a + minibatch-wise, iterative, fashion. Parameters ---------- num_thresholds : int, optional Number of thresholds to evaluate the curve. Default is 200. - curve_type : str, optional - Type of curve to compute, either "ROC" or "PR". Default is "PR". num_classes : int Number of classes in the multiclass setting. Must be >= 2. thresholds : list of float, optional - List of specific thresholds to evaluate the curve. - + List of specific thresholds to evaluate the metrics at. + A probability >= threshold is defined as a positive prediction for + the respective class. """ def __init__( @@ -140,7 +140,7 @@ def update( If the shapes of `y_true` and `y_pred` do not match. """ - y_true = np.squeeze(y_true) + y_true = np.squeeze(y_true).astype(int) y_score = np.squeeze(y_score) if check_inputs: @@ -149,6 +149,10 @@ def update( f"Unknown shape of y_true: {y_true.shape}," f"must be squeezable to either [-1, num_classes] or [-1]." ) + if y_true.ndim==2 and np.any( y_true.sum(-1)!=1): + raise ValueError( + "The provided one-hot encoding is invalid." 
+ ) if y_score.ndim > 2: raise ValueError( f"Unknown shape of y_true: {y_true.shape}," @@ -164,24 +168,25 @@ def update( raise ValueError(f"Invalid shape of y_pred: {y_score.shape}") if y_true.ndim == 2 and y_true.shape[1] == self.num_classes: - y_true_argmax = np.argmax(y_true, -1) + y_onehot = y_true else: - y_true_argmax = y_true - - for threshold_idx, threshold in enumerate(self.thresholds): - for class_idx in range(self.num_classes): - pred_pos = y_score[:, class_idx] >= threshold - is_pos = y_true_argmax == class_idx - - tp = np.sum(pred_pos & is_pos) - fp = np.sum(pred_pos & (~is_pos)) - fn = np.sum((~pred_pos) & (is_pos)) - tn = np.sum((~pred_pos) & (~is_pos)) - - self._confusion_matrix[threshold_idx, class_idx, 0, 0] += tp - self._confusion_matrix[threshold_idx, class_idx, 1, 0] += fp - self._confusion_matrix[threshold_idx, class_idx, 1, 1] += tn - self._confusion_matrix[threshold_idx, class_idx, 0, 1] += fn + y_onehot = np.eye(self.num_classes, dtype=int)[y_true] + + # use numpy broadcasting to get predictions + pred_pos = y_score[np.newaxis, ...] >= self.thresholds.reshape(-1,1,1) + is_pos = y_onehot[np.newaxis, ...] + + # sum over the minibatch samples + tp = np.sum(pred_pos & is_pos, 1) + fp = np.sum(pred_pos & (~is_pos), 1) + fn = np.sum((~pred_pos) & (is_pos), 1) + tn = np.sum((~pred_pos) & (~is_pos), 1) + + # update confusion matrix entry + self._confusion_matrix[..., 0, 0] += tp + self._confusion_matrix[..., 1, 0] += fp + self._confusion_matrix[..., 1, 1] += tn + self._confusion_matrix[..., 0, 1] += fn def _total(self) -> np.ndarray: """ diff --git a/tests/test_streaming_metrics.py b/tests/test_streaming_metrics.py index c266e0a..e97d9b0 100644 --- a/tests/test_streaming_metrics.py +++ b/tests/test_streaming_metrics.py @@ -4,7 +4,7 @@ from streamauc.metrics import f1_score, tpr, fpr import numpy as np -from sklearn.datasets import load_iris +from sklearn.datasets import load_iris, load_breast_cancer from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression from sklearn import metrics @@ -83,7 +83,8 @@ def test_reset(self): curve.confusion_matrix, expected_empty_confm ) - y_true = np.random.randint(0, 2, (10, curve.num_classes)) + y_true = np.random.randint(0, 2, (10,)) + y_true = np.eye(curve.num_classes)[y_true] y_pred = np.random.random((10, curve.num_classes)) y_pred = y_pred / y_pred.sum(-1, keepdims=True) @@ -132,7 +133,8 @@ def test_invalid_input(self): curve.update(y_true=y_true, y_score=y_pred) # should not throw any errors - y_true = np.random.randint(0, 2, (10, curve.num_classes, 1, 1, 1, 1)) + y_true = np.random.randint(0, 2, (10,)) + y_true = np.eye(curve.num_classes)[y_true][..., np.newaxis, np.newaxis] y_pred = np.random.randint(0, 2, (10, curve.num_classes)) curve.update(y_true=y_true, y_score=y_pred) @@ -250,6 +252,117 @@ def test_sklearn(self): class TestStreamingMetrics(unittest.TestCase): + def setUp(self): + cancer_ds = load_breast_cancer() + X, y = cancer_ds.data, cancer_ds.target + + random_state = np.random.RandomState(0) + n_samples, n_features = X.shape + X = np.concatenate( + [X, random_state.randn(n_samples, 200 * n_features)], axis=1 + ) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.5, stratify=y, random_state=0 + ) + + classifier = LogisticRegression(max_iter=1000) + self.y_score = classifier.fit(X_train, y_train).predict_proba(X_test) + + self.y_test = y_test + + thresholds = np.unique(self.y_score) + self.dim = 2 + self.curve = StreamingMetrics( + 
thresholds=thresholds, + num_classes=self.dim, + ) + + # check that multiple updates have the same effect as one big.. + half = self.y_test.shape[0] // 2 + self.curve.update(self.y_test[:half], self.y_score[:half]) + self.curve.update(self.y_test[half:], self.y_score[half:]) + + def test_total(self): + new_curve = StreamingMetrics( + num_thresholds=100, + num_classes=self.dim, + ) + + self.assertEqual(new_curve._total().shape, (100, self.dim)) + np.testing.assert_allclose( + new_curve._total(), np.zeros_like(new_curve._total()) + ) + + new_curve.update(self.y_test, self.y_score) + new_curve.update(self.y_test, self.y_score) + self.assertEqual(new_curve._total().shape, (100, self.dim)) + + np.testing.assert_allclose( + new_curve._total(), + 2 * self.y_test.shape[0] * np.ones_like(new_curve._total()), + ) + + def test_confusion_matrix(self): + for class_idx in range(self.dim): + y_true = self.y_test == class_idx + + for threshold in self.curve.thresholds: + y_pred = self.y_score[:, class_idx] >= threshold + + # sklearn has the confusion matrix flipped + confm_ref = np.flip(confusion_matrix(y_true, y_pred)) + + computed_confm = self.curve.confusion_matrix[ + self.curve.thresholds.tolist().index(threshold), class_idx + ] + np.testing.assert_array_equal(computed_confm, confm_ref) + + def test_precision_recall_curve(self): + for class_idx in range(self.dim): + precision, recall, thresholds = sk_precision_recall_curve( + self.y_test == class_idx, self.y_score[:, class_idx] + ) + + new_curve = StreamingMetrics( + thresholds=thresholds, + num_classes=self.dim, + ) + + # check that multiple updates have the same effect as one big.. + half = self.y_test.shape[0] // 2 + new_curve.update(self.y_test[:half], self.y_score[:half]) + new_curve.update(self.y_test[half:], self.y_score[half:]) + stream_prec, stream_recall, stream_thresholds = ( + new_curve.precision_recall_curve(class_index=class_idx) + ) + np.testing.assert_almost_equal(stream_thresholds[1:], thresholds) + np.testing.assert_almost_equal(precision[:1], stream_prec[:1]) + np.testing.assert_almost_equal(recall, stream_recall) + + def test_roc_curve(self): + for class_idx in range(self.dim): + _fpr, _tpr, thresholds = sk_roc_curve( + self.y_test == class_idx, self.y_score[:, class_idx] + ) + + new_curve = StreamingMetrics( + thresholds=thresholds[1:], + num_classes=self.dim, + ) + + # ensure that multiple updates have the same effect as one big.. 
+ half = self.y_test.shape[0] // 2 + new_curve.update(self.y_test[:half], self.y_score[:half]) + new_curve.update(self.y_test[half:], self.y_score[half:]) + + streaming_fpr, streaming_tpr, _thr = new_curve.roc_curve( + class_index=class_idx + ) + np.testing.assert_almost_equal(_fpr, streaming_fpr[:-1]) + np.testing.assert_almost_equal(_tpr, streaming_tpr[:-1]) + + +class TestStreamingMetricsBinary(unittest.TestCase): def setUp(self): iris = load_iris() X, y = iris.data, iris.target From 6761493c68c21457e8d971cc8a58102e41bb2661 Mon Sep 17 00:00:00 2001 From: Fabricio Arend Torres <9096900+FabricioArendTorres@users.noreply.github.com> Date: Fri, 19 Jul 2024 15:04:15 +0200 Subject: [PATCH 09/12] chore: removed check_input from example --- examples/example.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/example.py b/examples/example.py index 173f26b..17a960a 100644 --- a/examples/example.py +++ b/examples/example.py @@ -72,7 +72,8 @@ mb_y = y_test[mb_size * i:mb_size * (i + 1)] y_pred = classifier.predict_proba(mb_X) - stream_metrics.update(y_true=mb_y, y_score=y_pred) + # remove check inputs for faster updates + stream_metrics.update(y_true=mb_y, y_score=y_pred, check_inputs=False) # ###### METRICS From 48392d524ab52db28fd28e993ddcca410e3409ea Mon Sep 17 00:00:00 2001 From: Fabricio Arend Torres <9096900+FabricioArendTorres@users.noreply.github.com> Date: Fri, 19 Jul 2024 15:06:49 +0200 Subject: [PATCH 10/12] chore: fmt --- streamauc/streaming_metrics.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/streamauc/streaming_metrics.py b/streamauc/streaming_metrics.py index 072d159..6f6113e 100644 --- a/streamauc/streaming_metrics.py +++ b/streamauc/streaming_metrics.py @@ -149,10 +149,8 @@ def update( f"Unknown shape of y_true: {y_true.shape}," f"must be squeezable to either [-1, num_classes] or [-1]." ) - if y_true.ndim==2 and np.any( y_true.sum(-1)!=1): - raise ValueError( - "The provided one-hot encoding is invalid." - ) + if y_true.ndim == 2 and np.any(y_true.sum(-1) != 1): + raise ValueError("The provided one-hot encoding is invalid.") if y_score.ndim > 2: raise ValueError( f"Unknown shape of y_true: {y_true.shape}," @@ -173,7 +171,9 @@ def update( y_onehot = np.eye(self.num_classes, dtype=int)[y_true] # use numpy broadcasting to get predictions - pred_pos = y_score[np.newaxis, ...] >= self.thresholds.reshape(-1,1,1) + pred_pos = y_score[np.newaxis, ...] >= self.thresholds.reshape( + -1, 1, 1 + ) is_pos = y_onehot[np.newaxis, ...] 
# sum over the minibatch samples From c978c8b324422526f5a37c419cf8ab709024b41e Mon Sep 17 00:00:00 2001 From: Fabricio Arend Torres <9096900+FabricioArendTorres@users.noreply.github.com> Date: Fri, 19 Jul 2024 15:15:07 +0200 Subject: [PATCH 11/12] fix: remove import of matplotlib test utilities due to version incompatibility --- tests/utils/test_plots.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/utils/test_plots.py b/tests/utils/test_plots.py index 080f113..7ac52a1 100644 --- a/tests/utils/test_plots.py +++ b/tests/utils/test_plots.py @@ -2,7 +2,7 @@ from typing import Tuple, List import matplotlib.pyplot as plt import math -from matplotlib.testing.decorators import cleanup +# from matplotlib.testing.decorators import cleanup import numpy as np from streamauc.plot_util import create_square_subplots, plot_curve_and_auc @@ -78,7 +78,6 @@ def test_empty_case(self): class TestPlotcurveAUC(unittest.TestCase): - @cleanup def test_single_class_plot(self): fpr = np.array([0.0, 0.1, 0.2, 0.3, 1.0]) tpr = np.array([0.0, 0.4, 0.6, 0.8, 1.0]) @@ -88,7 +87,6 @@ def test_single_class_plot(self): self.assertIsInstance(fig, plt.Figure) plt.close(fig) - @cleanup def test_multi_class_plot(self): fpr = np.array( [[0.0, 0.1, 0.2, 0.3, 1.0], [0.0, 0.2, 0.4, 0.6, 1.0]] @@ -103,7 +101,6 @@ def test_multi_class_plot(self): self.assertIsInstance(fig, plt.Figure) plt.close(fig) - @cleanup def test_invalid_fpr_shape(self): fpr = np.array([[[0.0, 0.1, 0.2, 0.3, 1.0]]]) tpr = np.array([[[0.0, 0.4, 0.6, 0.8, 1.0]]]) From a4489dc0e5dad141fa7f5b5356cf8d01d332014f Mon Sep 17 00:00:00 2001 From: Fabricio Arend Torres <9096900+FabricioArendTorres@users.noreply.github.com> Date: Fri, 19 Jul 2024 15:16:22 +0200 Subject: [PATCH 12/12] fix: Forgot to lint.. --- tests/utils/test_plots.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/utils/test_plots.py b/tests/utils/test_plots.py index 7ac52a1..df06115 100644 --- a/tests/utils/test_plots.py +++ b/tests/utils/test_plots.py @@ -2,6 +2,7 @@ from typing import Tuple, List import matplotlib.pyplot as plt import math + # from matplotlib.testing.decorators import cleanup import numpy as np