From 3edfe61b2d0112cb6323ba756080557ce0fcdd17 Mon Sep 17 00:00:00 2001
From: Najib Ishaq
Date: Mon, 15 Jul 2024 09:46:45 -0400
Subject: [PATCH 01/14] chore: added new tool for dimension reduction

---
 .../dimension-reduction-tool/.bumpversion.cfg | 35 ++++++++++
 .../dimension-reduction-tool/Dockerfile       | 25 +++++++
 transforms/dimension-reduction-tool/README.md | 40 ++++++++++++
 transforms/dimension-reduction-tool/VERSION   |  1 +
 .../dimension-reduction-tool/build-docker.sh  | 22 +++++++
 .../dimensionreduction.cwl                    | 32 +++++++++
 transforms/dimension-reduction-tool/ict.yaml  | 59 +++++++++++++++++
 .../dimension-reduction-tool/plugin.json      | 65 +++++++++++++++++++
 .../dimension-reduction-tool/pyproject.toml   | 24 +++++++
 .../dimension-reduction-tool/run-plugin.sh    | 22 +++++++
 .../dimension_reduction/__init__.py           |  9 +++
 .../dimension_reduction/__main__.py           | 63 ++++++++++++++++++
 .../tests/__init__.py                         |  1 +
 .../tests/conftest.py                         | 14 ++++
 14 files changed, 412 insertions(+)
 create mode 100644 transforms/dimension-reduction-tool/.bumpversion.cfg
 create mode 100644 transforms/dimension-reduction-tool/Dockerfile
 create mode 100644 transforms/dimension-reduction-tool/README.md
 create mode 100644 transforms/dimension-reduction-tool/VERSION
 create mode 100644 transforms/dimension-reduction-tool/build-docker.sh
 create mode 100644 transforms/dimension-reduction-tool/dimensionreduction.cwl
 create mode 100644 transforms/dimension-reduction-tool/ict.yaml
 create mode 100644 transforms/dimension-reduction-tool/plugin.json
 create mode 100644 transforms/dimension-reduction-tool/pyproject.toml
 create mode 100644 transforms/dimension-reduction-tool/run-plugin.sh
 create mode 100644 transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__init__.py
 create mode 100644 transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__main__.py
 create mode 100644 transforms/dimension-reduction-tool/tests/__init__.py
 create mode 100644 transforms/dimension-reduction-tool/tests/conftest.py

diff --git a/transforms/dimension-reduction-tool/.bumpversion.cfg b/transforms/dimension-reduction-tool/.bumpversion.cfg
new file mode 100644
index 0000000..d9fa9f2
--- /dev/null
+++ b/transforms/dimension-reduction-tool/.bumpversion.cfg
@@ -0,0 +1,35 @@
+[bumpversion]
+current_version = 0.1.0-dev0
+commit = True
+tag = False
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
+serialize =
+	{major}.{minor}.{patch}-{release}{dev}
+	{major}.{minor}.{patch}
+
+[bumpversion:part:release]
+optional_value = _
+first_value = dev
+values =
+	dev
+	_
+
+[bumpversion:part:dev]
+
+[bumpversion:file:pyproject.toml]
+search = version = "{current_version}"
+replace = version = "{new_version}"
+
+[bumpversion:file:plugin.json]
+
+[bumpversion:file:VERSION]
+
+[bumpversion:file:README.md]
+
+[bumpversion:file:CHANGELOG.md]
+
+[bumpversion:file:ict.yaml]
+
+[bumpversion:file:dimensionreduciton.cwl]
+
+[bumpversion:file:src/polus/tabular/transforms/dimension_reduction/__init__.py]
diff --git a/transforms/dimension-reduction-tool/Dockerfile b/transforms/dimension-reduction-tool/Dockerfile
new file mode 100644
index 0000000..c783fd9
--- /dev/null
+++ b/transforms/dimension-reduction-tool/Dockerfile
@@ -0,0 +1,25 @@
+FROM polusai/bfio:2.3.6
+
+# environment variables defined in polusai/bfio
+ENV EXEC_DIR="/opt/executables"
+ENV POLUS_IMG_EXT=".ome.tif"
+ENV POLUS_TAB_EXT=".csv"
+ENV POLUS_LOG="INFO"
+
+# Work directory defined in the base container
+WORKDIR ${EXEC_DIR}
+
+# TODO: Change the tool_dir to the tool directory
+ENV TOOL_DIR="transforms/dimension-reduction-tool"
+
+# Copy the repository into the container
+RUN mkdir tabular-tools
+COPY . ${EXEC_DIR}/tabular-tools
+
+# Install the tool
+RUN pip3 install "${EXEC_DIR}/tabular-tools/${TOOL_DIR}" --no-cache-dir
+
+# Set the entrypoint
+# TODO: Change the entrypoint to the tool entrypoint
+ENTRYPOINT ["python3", "-m", "polus.tabular.transforms.dimension_reduction"]
+CMD ["--help"]
diff --git a/transforms/dimension-reduction-tool/README.md b/transforms/dimension-reduction-tool/README.md
new file mode 100644
index 0000000..a63711e
--- /dev/null
+++ b/transforms/dimension-reduction-tool/README.md
@@ -0,0 +1,40 @@
+# Dimension Reduction (v0.1.0-dev0)
+
+This tool is used to reduce the dimensionality of the input data.
+It provides the following methods for dimensionality reduction:
+
+1. Principal Component Analysis (PCA)
+2. t-Distributed Stochastic Neighbor Embedding (t-SNE)
+3. Uniform Manifold Approximation and Projection (UMAP)
+
+The input data should be in the form of a tabular file (`.csv` or `.arrow`).
+This tool takes tabular data as input and outputs a reduced dimensionality version of the input data.
+Each method has its own set of parameters that can be tuned to get the desired output.
+
+The CLI parameters are:
+
+1. `--inpDir`: Directory containing input tabular data.
+2. `--filePattern`: Pattern to parse tabular files.
+3. `--preview`: Generate JSON file with outputs without running the tool.
+4. `--outDir`: Output directory.
+
+## Docker Container
+
+To build the Docker image for this tool, run `./build-docker.sh`.
+
+## Install WIPP Plugin
+
+If WIPP is running, navigate to the plugins page and add a new plugin.
+Paste the contents of `plugin.json` into the pop-up window and submit.
+For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp).
+
+## Options
+
+This plugin takes three input arguments and one output argument:
+
+| Name            | Description                                               | I/O    | Type        | Default |
+| --------------- | --------------------------------------------------------- | ------ | ----------- | ------- |
+| `--inpDir`      | Directory containing input tabular data.                  | Input  | genericData | N/A     |
+| `--filePattern` | Pattern to parse tabular files.                           | Input  | string      | ".*"    |
+| `--preview`     | Generate JSON file with outputs without running the tool. | Input  | boolean     | False   |
+| `--outDir`      | Output directory.                                         | Output | genericData | N/A     |
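+
+For example, a typical local run (the paths here are illustrative, not part of this PR) might look like:
+
+```bash
+# Illustrative invocation of the installed package; adjust paths for your setup.
+python3 -m polus.tabular.transforms.dimension_reduction \
+  --inpDir /path/to/inputs \
+  --filePattern ".*" \
+  --outDir /path/to/outputs
+```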
diff --git a/transforms/dimension-reduction-tool/VERSION b/transforms/dimension-reduction-tool/VERSION
new file mode 100644
index 0000000..6b1a238
--- /dev/null
+++ b/transforms/dimension-reduction-tool/VERSION
@@ -0,0 +1 @@
+0.1.0-dev1
diff --git a/transforms/dimension-reduction-tool/build-docker.sh b/transforms/dimension-reduction-tool/build-docker.sh
new file mode 100644
index 0000000..ede49ff
--- /dev/null
+++ b/transforms/dimension-reduction-tool/build-docker.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# TODO: Change the name of the tool here
+tool_dir="transforms"
+tool_name="dimension-reduction-tool"
+
+# The version is read from the VERSION file
+version=$(<VERSION)
diff --git a/transforms/dimension-reduction-tool/pyproject.toml b/transforms/dimension-reduction-tool/pyproject.toml
new file mode 100644
index 0000000..88705dc
--- /dev/null
+++ b/transforms/dimension-reduction-tool/pyproject.toml
@@ -0,0 +1,24 @@
+",
+]
+readme = "README.md"
+packages = [{include = "polus", from = "src"}]
+
+[tool.poetry.dependencies]
+python = ">=3.9,<3.12"
+filepattern = "^2.0.0"
+typer = "^0.7.0"
+numpy = "<2.0.0"
+
+[tool.poetry.group.dev.dependencies]
+bump2version = "^1.0.1"
+pre-commit = "^3.0.4"
+pytest = "^7.2.1"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
diff --git a/transforms/dimension-reduction-tool/run-plugin.sh b/transforms/dimension-reduction-tool/run-plugin.sh
new file mode 100644
index 0000000..7eb71e5
--- /dev/null
+++ b/transforms/dimension-reduction-tool/run-plugin.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+version=$(<VERSION)
diff --git a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__main__.py b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__main__.py
new file mode 100644
index 0000000..eba0ef0
--- /dev/null
+++ b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__main__.py
@@ -0,0 +1,63 @@
+) -> None:
+    """CLI for the Dimension Reduction tool."""
+    logger.info(f"inpDir = {inp_dir}")
+    logger.info(f"filePattern = {file_pattern}")
+    logger.info(f"outDir = {out_dir}")
+    logger.info(f"preview = {preview}")
+
+    pass
+
+
+if __name__ == "__main__":
+    app()
diff --git a/transforms/dimension-reduction-tool/tests/__init__.py b/transforms/dimension-reduction-tool/tests/__init__.py
new file mode 100644
index 0000000..f2fe897
--- /dev/null
+++ b/transforms/dimension-reduction-tool/tests/__init__.py
@@ -0,0 +1 @@
+"""Tests for the dimension reduction tool."""
diff --git a/transforms/dimension-reduction-tool/tests/conftest.py b/transforms/dimension-reduction-tool/tests/conftest.py
new file mode 100644
index 0000000..5760cb9
--- /dev/null
+++ b/transforms/dimension-reduction-tool/tests/conftest.py
@@ -0,0 +1,14 @@
+"""Configuration for pytest."""
+
+import pytest
+
+
+def pytest_addoption(parser: pytest.Parser) -> None:
+    """Add options to pytest."""
+    parser.addoption(
+        "--slow",
+        action="store_true",
+        dest="slow",
+        default=False,
+        help="run slow tests",
+    )

From 6ac0a6d1c3d726a4179cd72d1877a973866de42c Mon Sep 17 00:00:00 2001
From: Najib Ishaq
Date: Tue, 16 Jul 2024 15:43:37 -0400
Subject: [PATCH 02/14] fix: bumpversion config

---
 transforms/dimension-reduction-tool/.bumpversion.cfg | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/transforms/dimension-reduction-tool/.bumpversion.cfg b/transforms/dimension-reduction-tool/.bumpversion.cfg
index d9fa9f2..fa56a60 100644
--- a/transforms/dimension-reduction-tool/.bumpversion.cfg
+++ b/transforms/dimension-reduction-tool/.bumpversion.cfg
@@ -26,10 +26,8 @@ replace = version = "{new_version}"
 
 [bumpversion:file:README.md]
 
-[bumpversion:file:CHANGELOG.md]
-
 [bumpversion:file:ict.yaml]
 
-[bumpversion:file:dimensionreduciton.cwl]
+[bumpversion:file:dimensionreduction.cwl]
 
 [bumpversion:file:src/polus/tabular/transforms/dimension_reduction/__init__.py]

From cda57981c603fdc4ff0734233b65e49b464a7fb4 Mon Sep 17 00:00:00 2001
From: Najib Ishaq
Date: Mon, 22 Jul 2024 15:29:31 -0400
Subject: [PATCH 03/14] feat: added algorithms

---
.../dimension-reduction-tool/pyproject.toml | 10 + .../dimension_reduction/__init__.py | 63 +++++++ .../dimension_reduction/__main__.py | 172 +++++++++++++++++- .../algorithms/__init__.py | 104 +++++++++++ .../dimension_reduction/algorithms/pca.py | 46 +++++ .../dimension_reduction/algorithms/tsne.py | 104 +++++++++++ .../dimension_reduction/algorithms/umap.py | 63 +++++++ .../transforms/dimension_reduction/data_io.py | 59 ++++++ 8 files changed, 620 insertions(+), 1 deletion(-) create mode 100644 transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/__init__.py create mode 100644 transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/pca.py create mode 100644 transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/tsne.py create mode 100644 transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/umap.py create mode 100644 transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/data_io.py diff --git a/transforms/dimension-reduction-tool/pyproject.toml b/transforms/dimension-reduction-tool/pyproject.toml index 88705dc..525f569 100644 --- a/transforms/dimension-reduction-tool/pyproject.toml +++ b/transforms/dimension-reduction-tool/pyproject.toml @@ -13,6 +13,10 @@ python = ">=3.9,<3.12" filepattern = "^2.0.0" typer = "^0.7.0" numpy = "<2.0.0" +scikit-learn = "^1.5.1" +umap-learn = "^0.5.6" +pyarrow = ">=16.0,<17.0" +pandas = "^2.2.2" [tool.poetry.group.dev.dependencies] bump2version = "^1.0.1" @@ -22,3 +26,9 @@ pytest = "^7.2.1" [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" + +[tool.ruff] +extend = "../../ruff.toml" +ignore = [ + "PLR0913", # Too many arguments to function call +] diff --git a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__init__.py b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__init__.py index e1b423d..3edd70d 100644 --- a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__init__.py +++ b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__init__.py @@ -2,8 +2,71 @@ import logging import os +import pathlib + +import numpy + +from .algorithms import Algorithm +from .algorithms import pca +from .algorithms import tsne +from .algorithms import umap +from .algorithms.pca import SvdSolver +from .data_io import Formats POLUS_LOG_LVL = os.environ.get("POLUS_LOG", logging.INFO) POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".arrow") __version__ = "0.1.0-dev1" + + +def reduce( + inp_path: pathlib.Path, + out_path: pathlib.Path, + algorithm: Algorithm, + kwargs: dict, +) -> None: + """Reduce the dimensionality of the data using the specified algorithm. + + The allowed formats for the input and output data are CSV, Parquet, Feather, + and NPY. + + The allowed algorithms are PCA, t-SNE, t-SNE with PCA initialization, and UMAP. + + Args: + inp_path: The path to the input data. + out_path: The path to write the reduced data. + algorithm: The algorithm to use for dimensionality reduction. + kwargs: Additional keyword arguments for the algorithm. 
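+
+    Example:
+        A minimal sketch with illustrative file paths (the PCA kwargs below
+        mirror the defaults exposed by the CLI):
+
+        >>> reduce(
+        ...     inp_path=pathlib.Path("features.csv"),
+        ...     out_path=pathlib.Path("reduced.npy"),
+        ...     algorithm=Algorithm.PCA,
+        ...     kwargs={
+        ...         "n_components": 2,
+        ...         "whiten": False,
+        ...         "svd_solver": SvdSolver.AUTO,
+        ...         "tol": 0.0,
+        ...     },
+        ... )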
+ """ + data = Formats.read(inp_path) + reduced_data: numpy.ndarray + + if algorithm == Algorithm.PCA: + reduced_data = pca.reduce(data, **kwargs) + elif algorithm == Algorithm.TSNE: + reduced_data = tsne.reduce(data, **kwargs) + elif algorithm == Algorithm.TSNE_INIT_PCA: + reduced_data = tsne.reduce_init_pca(data, **kwargs) + elif algorithm == Algorithm.UMAP: + reduced_data = umap.reduce(data, **kwargs) + else: + allowed_algorithms = ", ".join(Algorithm.__members__.keys()) + msg = ( + f"Unsupported algorithm: {algorithm}. Must be one of: {allowed_algorithms}" + ) + raise ValueError(msg) + + Formats.write(reduced_data, out_path) + + +__all__ = [ + "pca", + "tsne", + "umap", + "POLUS_LOG_LVL", + "POLUS_TAB_EXT", + "__version__", + "SvdSolver", + "Algorithm", + "reduce", +] diff --git a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__main__.py b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__main__.py index eba0ef0..585f34c 100644 --- a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__main__.py +++ b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__main__.py @@ -1,10 +1,17 @@ """CLI for the Dimension Reduction tool.""" +import json import logging import pathlib +import filepattern +import tqdm import typer from polus.tabular.transforms.dimension_reduction import POLUS_LOG_LVL +from polus.tabular.transforms.dimension_reduction import POLUS_TAB_EXT +from polus.tabular.transforms.dimension_reduction import Algorithm +from polus.tabular.transforms.dimension_reduction import SvdSolver +from polus.tabular.transforms.dimension_reduction import reduce # Initialize the logger logging.basicConfig( @@ -34,6 +41,115 @@ def main( "--filePattern", help="pattern to parse tabular files", ), + algorithm: Algorithm = typer.Option( + Algorithm.UMAP, + "--algorithm", + help="The algorithm to use for dimensionality reduction", + ), + n_components: int = typer.Option( + ..., + "--nComponents", + help="The dimensionality to reduce the data to", + ), + pca_whiten: bool = typer.Option( + False, + "--pcaWhiten", + help="PCA: Whether to whiten the data", + ), + pca_svd_solver: SvdSolver = typer.Option( + SvdSolver.AUTO, + "--pcaSvdSolver", + help="PCA: The singular value decomposition solver to use", + ), + pca_tol: float = typer.Option( + 0.0, + "--pcaTol", + help='PCA: Tolerance for singular values computed by svd_solver == "arpack"', + ), + tsne_perplexity: float = typer.Option( + 30.0, + "--tsnePerplexity", + help="t-SNE: The perplexity is related to the number of nearest neighbors " + "that is used in other manifold learning algorithms. Larger datasets " + "usually require a larger perplexity. Consider selecting a value between " + "5 and 50.", + ), + tsne_early_exaggeration: float = typer.Option( + 12.0, + "--tsneEarlyExaggeration", + help="t-SNE: Controls how tight natural clusters in the original space are in " + "the embedded space and how much space will be between them. For larger " + "values, the space between natural clusters will be larger in the embedded " + "space.", + ), + tsne_learning_rate: float = typer.Option( + 200.0, + "--tsneLearningRate", + help="The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If " + "the learning rate is too high, the data may look like a 'ball' with any " + "point approximately equidistant from its nearest neighbours. 
If the learning " + "rate is too low, most points may look compressed in a dense cloud with few " + "outliers. If the cost function gets stuck in a bad local minimum increasing " + "the learning rate may help.", + ), + tsne_max_iter: int = typer.Option( + 1000, + "--tsneMaxIter", + help="t-SNE: Maximum number of iterations for the optimization. Should be at " + "least 250.", + ), + tsne_metric: str = typer.Option( + "euclidean", + "--tsneMetric", + help="t-SNE: The metric to use when calculating distance between " + "instances in a feature array. It must be one of the options allowed by " + "scipy.spatial.distance.pdist for its metric parameter", + ), + tsne_init_n_components: int = typer.Option( + 50, + "--tsneInitNComponents", + help="t-SNE: The number of components to reduce to with PCA before running " + "t-SNE.", + ), + umap_n_neighbors: int = typer.Option( + 15, + "--umapNNeighbors", + help="UMAP: The size of local neighborhood (in terms of number of neighboring " + "sample points) used for manifold approximation. Larger values result in more " + "global views of the manifold, while smaller values result in more local data " + "being preserved. In general, values should be in the range 2 to 100.", + ), + umap_n_epochs: int = typer.Option( + None, + "--umapNEpochs", + help="UMAP: The number of training epochs to be used in optimizing the low " + "dimensional embedding. Larger values result in more accurate embeddings. If " + "None, the value will be set automatically based on the size of the input " + "dataset (200 for large datasets, 500 for small).", + ), + umap_min_dist: float = typer.Option( + 0.1, + "--umapMinDist", + help="UMAP: The effective minimum distance between embedded points. Smaller " + "values will result in a more clustered/clumped embedding where nearby points " + "on the manifold are drawn closer together, while larger values will result " + "in a more even dispersal of points. The value should be set relative to the " + "spread value, which determines the scale at which embedded points will be " + "spread out.", + ), + umap_spread: float = typer.Option( + 1.0, + "--umapSpread", + help="UMAP: The effective scale of embedded points. In combination with " + "min_dist this determines how clustered/clumped the embedded points are.", + ), + umap_metric: str = typer.Option( + "euclidean", + "--umapMetric", + help="UMAP: The metric to use when calculating distance between " + "instances in a feature array. 
It must be one of the options allowed by " + "scipy.spatial.distance.pdist for its metric parameter", + ), out_dir: pathlib.Path = typer.Option( ..., "--outDir", @@ -53,10 +169,64 @@ def main( """CLI for the Dimension Reduction tool.""" logger.info(f"inpDir = {inp_dir}") logger.info(f"filePattern = {file_pattern}") + logger.info(f"algorithm = {algorithm.value}") + logger.info(f"nComponents = {n_components}") + logger.info(f"pcaWhiten = {pca_whiten}") + logger.info(f"pcaSvdSolver = {pca_svd_solver.value}") + logger.info(f"pcaTol = {pca_tol}") + logger.info(f"tsnePerplexity = {tsne_perplexity}") + logger.info(f"tsneEarlyExaggeration = {tsne_early_exaggeration}") + logger.info(f"tsneLearningRate = {tsne_learning_rate}") + logger.info(f"tsneMaxIter = {tsne_max_iter}") + logger.info(f"tsneMetric = {tsne_metric}") + logger.info(f"tsneInitNComponents = {tsne_init_n_components}") + logger.info(f"umapNNeighbors = {umap_n_neighbors}") + logger.info(f"umapNEpochs = {umap_n_epochs}") + logger.info(f"umapMinDist = {umap_min_dist}") + logger.info(f"umapSpread = {umap_spread}") + logger.info(f"umapMetric = {umap_metric}") logger.info(f"outDir = {out_dir}") logger.info(f"preview = {preview}") - pass + kwargs = { + "n_components": n_components, + "pca_whiten": pca_whiten, + "pca_svd_solver": pca_svd_solver, + "pca_tol": pca_tol, + "tsne_perplexity": tsne_perplexity, + "tsne_early_exaggeration": tsne_early_exaggeration, + "tsne_learning_rate": tsne_learning_rate, + "tsne_max_iter": tsne_max_iter, + "tsne_metric": tsne_metric, + "tsne_init_n_components": tsne_init_n_components, + "umap_n_neighbors": umap_n_neighbors, + "umap_n_epochs": umap_n_epochs, + "umap_min_dist": umap_min_dist, + "umap_spread": umap_spread, + "umap_metric": umap_metric, + } + kwargs = algorithm.parse_kwargs(kwargs) + + fp = filepattern.FilePattern(path=inp_dir, pattern=file_pattern) + files: list[pathlib.Path] = list(map(pathlib.Path, fp())) + logger.info(f"Found {len(files)} files to process.") + + path: pathlib.Path + + if preview: + out_dict: dict[str, list[str]] = {"files": []} + for path in files: + out_dict["files"].append(str(out_dir / (path.stem + POLUS_TAB_EXT))) + with (out_dir / "preview.json").open("w") as f: + json.dump(out_dict, f, indent=2) + else: + for path in tqdm.tqdm(files): + reduce( + inp_path=path, + out_path=out_dir / (path.stem + POLUS_TAB_EXT), + algorithm=algorithm, + inp_kwargs=kwargs, + ) if __name__ == "__main__": diff --git a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/__init__.py b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/__init__.py new file mode 100644 index 0000000..62cc4b4 --- /dev/null +++ b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/__init__.py @@ -0,0 +1,104 @@ +"""Dimension Reduction algorithms supported by this tool.""" + +import enum +import typing + +from . import pca +from . import tsne +from . 
import umap +from .pca import SvdSolver + + +class Algorithm(str, enum.Enum): + """The dimension reduction algorithms supported by this tool.""" + + PCA = "pca" + TSNE = "tsne" + TSNE_INIT_PCA = "tsne_init_pca" + UMAP = "umap" + + def parse_kwargs(self, inp_kwargs: dict) -> dict: # noqa: PLR0915, PLR0912, C901 + """Converts the inputs from the typer CLI to be used by the algorithms.""" + out_kwargs = {} + + if "n_components" in inp_kwargs: + out_kwargs["n_components"] = inp_kwargs["n_components"] + else: + msg = "n_components is a required argument." + raise ValueError(msg) + + if self == Algorithm.PCA: + expected_keys = ["whiten", "svd_solver", "tol"] + for key in expected_keys: + pca_key = f"pca_{key}" + if pca_key in inp_kwargs: + out_kwargs[key] = inp_kwargs[pca_key] + else: + msg = f"{pca_key} is a required argument for PCA." + raise ValueError(msg) + elif self == Algorithm.TSNE: + expected_keys = [ + "perplexity", + "early_exaggeration", + "learning_rate", + "max_iter", + "metric", + ] + for key in expected_keys: + tsne_key = f"tsne_{key}" + if tsne_key in inp_kwargs: + out_kwargs[key] = inp_kwargs[tsne_key] + else: + msg = f"{tsne_key} is a required argument for t-SNE." + raise ValueError(msg) + elif self == Algorithm.TSNE_INIT_PCA: + if "tsne_init_n_components" in inp_kwargs: + out_kwargs["pca_n_components"] = inp_kwargs["tsne_init_n_components"] + else: + msg = ( + "tsne_init_n_components is a required argument for t-SNE " + "with PCA initialization." + ) + raise ValueError(msg) + + pca_keys = ["whiten", "svd_solver", "tol"] + for key in pca_keys: + pca_key = f"pca_{key}" + if pca_key in inp_kwargs: + out_kwargs[pca_key] = inp_kwargs[pca_key] + else: + msg = f"{pca_key} is a required argument for PCA." + raise ValueError(msg) + + tsne_keys = [ + "perplexity", + "early_exaggeration", + "learning_rate", + "max_iter", + "metric", + ] + for key in tsne_keys: + tsne_key = f"tsne_{key}" + if tsne_key in inp_kwargs: + out_kwargs[key] = inp_kwargs[tsne_key] + else: + msg = f"{tsne_key} is a required argument for t-SNE." + raise ValueError(msg) + elif self == Algorithm.UMAP: + expected_keys = ["n_neighbors", "n_epochs", "min_dist", "spread"] + for key in expected_keys: + umap_key = f"umap_{key}" + if umap_key in inp_kwargs: + out_kwargs[key] = inp_kwargs[umap_key] + else: + msg = f"{umap_key} is a required argument for UMAP." + raise ValueError(msg) + else: + allowed_algorithms = ", ".join(Algorithm.__members__.keys()) + msg = f"Unsupported algorithm: {self}. 
Must be one of: {allowed_algorithms}" + raise ValueError(msg) + + return out_kwargs + + +__all__ = ["pca", "tsne", "umap", "SvdSolver", "Algorithm"] diff --git a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/pca.py b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/pca.py new file mode 100644 index 0000000..27e3407 --- /dev/null +++ b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/pca.py @@ -0,0 +1,46 @@ +"""Dimension reduction by Principal Component Analysis (PCA).""" + +import enum + +import numpy +import sklearn.decomposition + + +class SvdSolver(str, enum.Enum): + """The singular value decomposition solver to use.""" + + AUTO = "auto" + FULL = "full" + ARPACK = "arpack" + RANDOMIZED = "randomized" + + +def reduce( + data: numpy.ndarray, + *, + n_components: int, + whiten: bool = False, + svd_solver: SvdSolver = SvdSolver.AUTO, + tol: float = 0.0, +) -> numpy.ndarray: + """Reduce the dimensionality of the data using PCA. + + Args: + data: The data to reduce. + n_components: The number of components to reduce to. + whiten: Whether to whiten the data. Defaults to False. + svd_solver: The singular value decomposition solver to use. Defaults to + "auto". + tol: Tolerance for singular values computed by svd_solver == "arpack". + Must be of range [0.0, infinity). + + Returns: + The reduced data. + """ + pca = sklearn.decomposition.PCA( + n_components=n_components, + whiten=whiten, + svd_solver=svd_solver.value, + tol=tol, + ) + return pca.fit_transform(data) diff --git a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/tsne.py b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/tsne.py new file mode 100644 index 0000000..7bce097 --- /dev/null +++ b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/tsne.py @@ -0,0 +1,104 @@ +"""Dimension reduction by t-distributed Stochastic Neighbor Embedding (t-SNE).""" + +import typing + +import numpy +import sklearn.manifold + +from . import pca + + +def reduce( + data: numpy.ndarray, + *, + n_components: int, + perplexity: float = 30.0, + early_exaggeration: float = 12.0, + learning_rate: typing.Union[float, typing.Literal["auto"]] = "auto", + max_iter: int = 1000, + metric: str = "euclidean", +) -> numpy.ndarray: + """Reduce the dimensionality of the data using t-SNE. + + Args: + data: The data to reduce. + + n_components: The number of components to reduce to. + + perplexity: The perplexity is related to the number of nearest neighbors + that is used in other manifold learning algorithms. Larger datasets + usually require a larger perplexity. The perplexity must be less + than the number of samples. + + early_exaggeration: Controls how tight natural clusters in the original + space are in the embedded space and how much space will be between them. + For larger values, the space between natural clusters will be larger in + the embedded space. + + learning_rate: The learning rate for t-SNE is usually in the range + [10.0, 1000.0]. If the learning rate is too high, the data may look like + a 'ball' with any point approximately equidistant from its nearest + neighbours. If the learning rate is too low, most points may look + compressed in a dense cloud with few outliers. If the cost function gets + stuck in a bad local minimum increasing the learning rate may help. 
+ + max_iter: Maximum number of iterations for the optimization. Should be + at least 250. + + metric: The metric to use when calculating distance between instances in + a feature array. It must be one of the options allowed by + scipy.spatial.distance.pdist for its metric parameter, or a metric + listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS. + + Returns: + The reduced data. + """ + tsne = sklearn.manifold.TSNE( + n_components=n_components, + perplexity=perplexity, + early_exaggeration=early_exaggeration, + learning_rate=learning_rate, + max_iter=max_iter, + metric=metric, + ) + return tsne.fit_transform(data) + + +def reduce_init_pca( + data: numpy.ndarray, + *, + pca_n_components: int, + pca_whiten: bool = False, + pca_svd_solver: pca.SvdSolver = pca.SvdSolver.AUTO, + pca_tol: float = 0.0, + n_components: int, + perplexity: float = 30.0, + early_exaggeration: float = 12.0, + learning_rate: typing.Union[float, typing.Literal["auto"]] = "auto", + max_iter: int = 1000, + metric: str = "euclidean", +) -> numpy.ndarray: + """Reduce the dimensionality of the data using PCA followed by t-SNE. + + This is useful when the data has a high number of dimensions and t-SNE + would be too slow to run directly. + + For the parameter documentation, see the `pca.reduce` and `tsne.reduce` + functions. + """ + pca_data = pca.reduce( + data, + n_components=pca_n_components, + whiten=pca_whiten, + svd_solver=pca_svd_solver, + tol=pca_tol, + ) + return reduce( + pca_data, + n_components=n_components, + perplexity=perplexity, + early_exaggeration=early_exaggeration, + learning_rate=learning_rate, + max_iter=max_iter, + metric=metric, + ) diff --git a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/umap.py b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/umap.py new file mode 100644 index 0000000..b05a69c --- /dev/null +++ b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/umap.py @@ -0,0 +1,63 @@ +"""Dimension reduction by Uniform Manifold Approximation and Projection (UMAP).""" + +import typing + +import numpy +import umap + + +def reduce( + data: numpy.ndarray, + *, + n_components: int, + n_neighbors: int = 15, + metric: str = "euclidean", + n_epochs: typing.Optional[int] = None, + min_dist: float = 0.1, + spread: float = 1.0, +) -> numpy.ndarray: + """Reduce the dimensionality of the data using UMAP. + + Args: + data: The data to reduce. + + n_components: The number of components to reduce to. + + n_neighbors: The size of local neighborhood (in terms of number of + neighboring sample points) used for manifold approximation. Larger + values result in more global views of the manifold, while smaller + values result in more local data being preserved. In general, values + should be in the range 2 to 100. + + metric: The metric to use when calculating distance between instances in + the high dimensional space. It must be one of the options allowed by + scipy.spatial.distance.pdist for its metric parameter. + + n_epochs: The number of training epochs to be used in optimizing the + low dimensional embedding. Larger values result in more accurate + embeddings. If None, the value will be set automatically based on the + size of the input dataset (200 for large datasets, 500 for small). + + min_dist: The effective minimum distance between embedded points. 
+ Smaller values will result in a more clustered/clumped embedding where + nearby points on the manifold are drawn closer together, while larger + values will result in a more even dispersal of points. The value should + be set relative to the spread value, which determines the scale at + which embedded points will be spread out. + + spread: The effective scale of embedded points. In combination with + ``min_dist`` this determines how clustered/clumped the embedded points + are. + + Returns: + The reduced data. + """ + reducer = umap.UMAP( + n_components=n_components, + n_neighbors=n_neighbors, + metric=metric, + n_epochs=n_epochs, + min_dist=min_dist, + spread=spread, + ) + return reducer.fit_transform(data) diff --git a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/data_io.py b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/data_io.py new file mode 100644 index 0000000..c438f44 --- /dev/null +++ b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/data_io.py @@ -0,0 +1,59 @@ +"""Helpers for reading and writing data.""" + +import enum +import pathlib + +import numpy +import pandas + + +class Formats(str, enum.Enum): + """The data formats supported by this tool.""" + + CSV = "csv" + PARQUET = "parquet" + FEATHER = "feather" + NPY = "npy" + + @staticmethod + def read(path: pathlib.Path) -> numpy.ndarray: + """Read the data from the specified path.""" + # Read the extension of the file + ext = path.suffix + + data: numpy.ndarray + if ext == ".csv": + data = pandas.read_csv(path).to_numpy(dtype=numpy.float32) + elif ext == ".parquet": + data = pandas.read_parquet(path).to_numpy(dtype=numpy.float32) + elif ext == ".feather": + data = pandas.read_feather(path).to_numpy(dtype=numpy.float32) + elif ext == ".npy": + data = numpy.load(path) + data = data.astype(numpy.float32) + else: + allowed_formats = ", ".join(Formats.__members__.keys()) + msg = f"Unsupported file format: {ext}. Must be one of: {allowed_formats}" + raise ValueError(msg) + + @staticmethod + def write(data: numpy.ndarray, path: pathlib.Path) -> None: + """Write the data to the specified path.""" + # Write the extension of the file + ext = path.suffix + + if ext == ".csv": + pandas.DataFrame(data).to_csv(path, index=False) + elif ext == ".parquet": + pandas.DataFrame(data).to_parquet(path, index=False) + elif ext == ".feather": + pandas.DataFrame(data).to_feather(path) + elif ext == ".npy": + numpy.save(path, data) + else: + allowed_formats = ", ".join(Formats.__members__.keys()) + msg = f"Unsupported file format: {ext}. 
Must be one of: {allowed_formats}" + raise ValueError(msg) + + +__all__ = ["Formats"] From 75fad8f0e07ff9e7e81e4528b86b094073df7b85 Mon Sep 17 00:00:00 2001 From: Najib Ishaq Date: Mon, 22 Jul 2024 16:02:59 -0400 Subject: [PATCH 04/14] test: added tests --- .../dimension-reduction-tool/pyproject.toml | 2 + .../tests/test_tool.py | 196 ++++++++++++++++++ 2 files changed, 198 insertions(+) create mode 100644 transforms/dimension-reduction-tool/tests/test_tool.py diff --git a/transforms/dimension-reduction-tool/pyproject.toml b/transforms/dimension-reduction-tool/pyproject.toml index 525f569..dbc7db9 100644 --- a/transforms/dimension-reduction-tool/pyproject.toml +++ b/transforms/dimension-reduction-tool/pyproject.toml @@ -22,6 +22,8 @@ pandas = "^2.2.2" bump2version = "^1.0.1" pre-commit = "^3.0.4" pytest = "^7.2.1" +pytest-sugar = "^1.0.0" +pytest-xdist = "^3.6.1" [build-system] requires = ["poetry-core"] diff --git a/transforms/dimension-reduction-tool/tests/test_tool.py b/transforms/dimension-reduction-tool/tests/test_tool.py new file mode 100644 index 0000000..ce83455 --- /dev/null +++ b/transforms/dimension-reduction-tool/tests/test_tool.py @@ -0,0 +1,196 @@ +"""Tests for the tools.""" + +import copy +import pytest +import numpy +import sklearn.datasets + +from polus.tabular.transforms.dimension_reduction import algorithms + + +def gen_pca_args( + n_components: list[int] = [2, 3, 10], + whiten: list[bool] = [True, False], + svd_solver: list[algorithms.SvdSolver] = [ + algorithms.SvdSolver.AUTO, + algorithms.SvdSolver.FULL, + algorithms.SvdSolver.ARPACK, + algorithms.SvdSolver.RANDOMIZED, + ], + tol: list[float] = [0.0, 0.1, 0.5, 1.0], +) -> list[dict]: + """Generate arguments for the PCA algorithm.""" + all_kwargs = [] + for n in n_components: + for w in whiten: + for s in svd_solver: + if s == algorithms.SvdSolver.ARPACK: + for t in tol: + all_kwargs.append( + { + "n_components": n, + "whiten": w, + "svd_solver": s, + "tol": t, + } + ) + else: + all_kwargs.append( + { + "n_components": n, + "whiten": w, + "svd_solver": s, + "tol": 0.0, + } + ) + return all_kwargs + + +@pytest.mark.parametrize("kwargs", gen_pca_args()) +def test_pca(kwargs: dict): + """Test the PCA algorithm.""" + + digits = sklearn.datasets.load_digits() + data: numpy.ndarray = digits.data + + assert data.shape == (1797, 64) + + reduced = algorithms.pca.reduce(data.astype(numpy.float32), **kwargs) + + assert reduced.shape[0] == 1797 + assert reduced.shape[1] == kwargs["n_components"] + assert reduced.dtype == numpy.float32 + + +def gen_tsne_args( + n_components: list[int] = [2, 3], + perplexity: list[float] = [5.0, 50.0], + early_exaggeration: list[float] = [5.0, 20.0], + learning_rate: list[float] = [200.0, "auto"], + max_iter: list[int] = [250, 1000], + metric: list[str] = ["euclidean", "cosine"], +) -> list[dict]: + """Generate arguments for testing the t-SNE algorithm.""" + all_kwargs = [] + for n in n_components: + for p in perplexity: + for e in early_exaggeration: + for l in learning_rate: + for m in max_iter: + for me in metric: + all_kwargs.append( + { + "n_components": n, + "perplexity": p, + "early_exaggeration": e, + "learning_rate": l, + "max_iter": m, + "metric": me, + } + ) + return all_kwargs + + +@pytest.mark.parametrize("kwargs", gen_tsne_args()) +def test_tsne(kwargs: dict): + """Test the t-SNE algorithm.""" + + digits = sklearn.datasets.load_digits() + data: numpy.ndarray = digits.data + + assert data.shape == (1797, 64) + + reduced = algorithms.tsne.reduce(data.astype(numpy.float32), **kwargs) + + 
assert reduced.shape[0] == 1797 + assert reduced.shape[1] == kwargs["n_components"] + assert reduced.dtype == numpy.float32 + + +def gen_tsne_pca_args( + n_components: list[int] = [2, 3], + pca_n_components: list[int] = [10, 50], + perplexity: list[float] = [10.0, 30.0, 50.0], +) -> list[dict]: + """Generate arguments for testing the t-SNE algorithm with PCA initialization.""" + base_kwargs = { + "pca_n_components": 2, + "pca_whiten": False, + "pca_svd_solver": algorithms.SvdSolver.AUTO, + "pca_tol": 0.0, + "early_exaggeration": 12.0, + "learning_rate": "auto", + "max_iter": 1000, + "metric": "euclidean", + } + all_kwargs = [] + for n in n_components: + for p in pca_n_components: + for pe in perplexity: + kwargs = copy.deepcopy(base_kwargs) + kwargs["n_components"] = n + kwargs["pca_n_components"] = p + kwargs["perplexity"] = pe + all_kwargs.append(kwargs) + return all_kwargs + + +@pytest.mark.parametrize("kwargs", gen_tsne_pca_args()) +def test_tsne_pca(kwargs: dict): + """Test the t-SNE algorithm with PCA initialization.""" + + digits = sklearn.datasets.load_digits() + data: numpy.ndarray = digits.data + + assert data.shape == (1797, 64) + + reduced = algorithms.tsne.reduce_init_pca(data.astype(numpy.float32), **kwargs) + + assert reduced.shape[0] == 1797 + assert reduced.shape[1] == kwargs["n_components"] + assert reduced.dtype == numpy.float32 + + +def gen_umap_args( + n_components: list[int] = [2, 3, 10], + n_neighbors: list[int] = [5, 15, 50], + metric: list[str] = ["euclidean", "cosine"], + n_epochs: list[int] = [None, 100], + min_dist: list[float] = [0.05, 0.1, 0.2], + spread: list[float] = [1.0, 2.0], +) -> list[dict]: + """Generate arguments for the UMAP algorithm.""" + all_kwargs = [] + for n in n_components: + for nn in n_neighbors: + for m in metric: + for ne in n_epochs: + for md in min_dist: + for s in spread: + all_kwargs.append( + { + "n_components": n, + "n_neighbors": nn, + "metric": m, + "n_epochs": ne, + "min_dist": md, + "spread": s, + } + ) + return all_kwargs + + +@pytest.mark.parametrize("kwargs", gen_umap_args()) +def test_umap(kwargs: dict): + """Test the UMAP algorithm.""" + + digits = sklearn.datasets.load_digits() + data: numpy.ndarray = digits.data + + assert data.shape == (1797, 64) + + reduced = algorithms.umap.reduce(data.astype(numpy.float32), **kwargs) + + assert reduced.shape[0] == 1797 + assert reduced.shape[1] == kwargs["n_components"] + assert reduced.dtype == numpy.float32 From f9c6777611b8a1e3950e042634b860df39953691 Mon Sep 17 00:00:00 2001 From: Najib Ishaq Date: Tue, 23 Jul 2024 15:30:15 -0400 Subject: [PATCH 05/14] test: added lots more tests --- .../dimension-reduction-tool/Dockerfile | 2 +- .../dimension_reduction/__init__.py | 2 +- .../dimension_reduction/__main__.py | 5 +- .../transforms/dimension_reduction/data_io.py | 5 +- .../tests/test_cli.py | 228 ++++++++++++++++ .../tests/test_tool.py | 252 ++++++++---------- 6 files changed, 342 insertions(+), 152 deletions(-) create mode 100644 transforms/dimension-reduction-tool/tests/test_cli.py diff --git a/transforms/dimension-reduction-tool/Dockerfile b/transforms/dimension-reduction-tool/Dockerfile index c783fd9..61df5e5 100644 --- a/transforms/dimension-reduction-tool/Dockerfile +++ b/transforms/dimension-reduction-tool/Dockerfile @@ -3,7 +3,7 @@ FROM polusai/bfio:2.3.6 # environment variables defined in polusai/bfio ENV EXEC_DIR="/opt/executables" ENV POLUS_IMG_EXT=".ome.tif" -ENV POLUS_TAB_EXT=".csv" +ENV POLUS_TAB_EXT=".feather" ENV POLUS_LOG="INFO" # Work directory defined in the base 
container diff --git a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__init__.py b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__init__.py index 3edd70d..eb8d022 100644 --- a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__init__.py +++ b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__init__.py @@ -14,7 +14,7 @@ from .data_io import Formats POLUS_LOG_LVL = os.environ.get("POLUS_LOG", logging.INFO) -POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".arrow") +POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".feather") __version__ = "0.1.0-dev1" diff --git a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__main__.py b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__main__.py index 585f34c..6c7c1b4 100644 --- a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__main__.py +++ b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__main__.py @@ -208,7 +208,8 @@ def main( kwargs = algorithm.parse_kwargs(kwargs) fp = filepattern.FilePattern(path=inp_dir, pattern=file_pattern) - files: list[pathlib.Path] = list(map(pathlib.Path, fp())) + files = [p for _, [p] in fp()] + logger.info(f"Found {len(files)} files to process.") path: pathlib.Path @@ -225,7 +226,7 @@ def main( inp_path=path, out_path=out_dir / (path.stem + POLUS_TAB_EXT), algorithm=algorithm, - inp_kwargs=kwargs, + kwargs=kwargs, ) diff --git a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/data_io.py b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/data_io.py index c438f44..e57f6f3 100644 --- a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/data_io.py +++ b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/data_io.py @@ -23,7 +23,8 @@ def read(path: pathlib.Path) -> numpy.ndarray: data: numpy.ndarray if ext == ".csv": - data = pandas.read_csv(path).to_numpy(dtype=numpy.float32) + df = pandas.read_csv(path) + data = df.to_numpy(dtype=numpy.float32) elif ext == ".parquet": data = pandas.read_parquet(path).to_numpy(dtype=numpy.float32) elif ext == ".feather": @@ -36,6 +37,8 @@ def read(path: pathlib.Path) -> numpy.ndarray: msg = f"Unsupported file format: {ext}. 
Must be one of: {allowed_formats}" raise ValueError(msg) + return data + @staticmethod def write(data: numpy.ndarray, path: pathlib.Path) -> None: """Write the data to the specified path.""" diff --git a/transforms/dimension-reduction-tool/tests/test_cli.py b/transforms/dimension-reduction-tool/tests/test_cli.py new file mode 100644 index 0000000..f829069 --- /dev/null +++ b/transforms/dimension-reduction-tool/tests/test_cli.py @@ -0,0 +1,228 @@ +"""Tests for the CLI.""" + +import copy +import pathlib +import tempfile + +from polus.tabular.transforms.dimension_reduction.algorithms import Algorithm + +import numpy +import pytest +import sklearn.datasets +import typer.testing +from polus.tabular.transforms.dimension_reduction.data_io import Formats +from polus.tabular.transforms.dimension_reduction.__main__ import app + + +ALGORITHMS = [Algorithm.TSNE_INIT_PCA, Algorithm.UMAP] +FORMATS = ["csv", "feather"] + + +def create_data(inp_format: str) -> tuple[pathlib.Path, pathlib.Path]: + """Generate data.""" + + data_dir = pathlib.Path(tempfile.mkdtemp(suffix="_data_dir")) + + inp_dir = data_dir.joinpath("inp_dir") + inp_dir.mkdir() + + out_dir = data_dir.joinpath("out_dir") + out_dir.mkdir() + + digits = sklearn.datasets.load_digits() + data: numpy.ndarray = digits.data + data = data.astype(numpy.float32) + Formats.write(data, inp_dir.joinpath(f"digits.{inp_format}")) + + return inp_dir, out_dir + + +@pytest.mark.parametrize("inp_format", FORMATS) +@pytest.mark.parametrize("out_format", FORMATS) +def test_data_io(inp_format: str, out_format: str) -> None: + """Test data IO.""" + + inp_dir, out_dir = create_data(inp_format) + assert inp_dir.exists() + assert out_dir.exists() + + inp_files: list[pathlib.Path] = list(inp_dir.iterdir()) + + assert len(inp_files) == 1 + assert inp_files[0].name == "digits." + inp_format + + out_path = out_dir.joinpath(inp_files[0].stem + f".{out_format}") + inp_data = Formats.read(inp_dir.joinpath(inp_files[0])) + Formats.write(inp_data, out_path) + + out_files: list[pathlib.Path] = list(out_dir.iterdir()) + assert len(out_files) == 1 + assert out_files[0].name == "digits." 
+ out_format + + out_data = Formats.read(out_path) + + assert inp_data.shape == out_data.shape + assert inp_data.dtype == out_data.dtype + numpy.testing.assert_allclose(inp_data, out_data) + + +def gen_pca_args( + svd_solver: list[str] = ["auto", "arpack"], + tol: list[float] = [0.0, 0.1, 0.5, 1.0], +) -> list[dict]: + """Generate arguments for the PCA algorithm.""" + all_kwargs = [] + for s in svd_solver: + if s == "arpack": + for t in tol: + all_kwargs.append( + { + "pcaSvdSolver": s, + "pcaTol": t, + } + ) + else: + all_kwargs.append( + { + "pcaSvdSolver": s, + "pcaTol": 0.0, + } + ) + return all_kwargs + + +def gen_tsne_args( + perplexity: list[float] = [5.0, 50.0], + early_exaggeration: list[float] = [4.0, 24.0], + learning_rate: list[float] = [100.0, 200.0], + max_iter: list[int] = [250, 1000], + metric: list[str] = ["euclidean", "cosine"], +) -> list[dict]: + """Generate arguments for the t-SNE algorithm.""" + all_kwargs = [] + for p in perplexity: + for e in early_exaggeration: + for l in learning_rate: + for m in max_iter: + for me in metric: + all_kwargs.append( + { + "tsnePerplexity": p, + "tsneEarlyExaggeration": e, + "tsneLearningRate": l, + "tsneMaxIter": m, + "tsneMetric": me, + } + ) + return all_kwargs + + +def gen_tsne_pca_args( + perplexity: list[float] = [5.0, 50.0], + early_exaggeration: list[float] = [4.0, 24.0], + learning_rate: list[float] = [100.0, 200.0], + max_iter: list[int] = [250, 1000], + metric: list[str] = ["euclidean", "cosine"], + init_n_components: list[int] = [10, 50], +) -> list[dict]: + """Generate arguments for the t-SNE algorithm with PCA initialization.""" + tsne_kwargs = gen_tsne_args( + perplexity, early_exaggeration, learning_rate, max_iter, metric + ) + all_kwargs = [] + for inp_kwargs in tsne_kwargs: + for n in init_n_components: + kwargs = copy.deepcopy(inp_kwargs) + kwargs["tsneInitNComponents"] = n + all_kwargs.append(kwargs) + return all_kwargs + + +def gen_umap_args( + n_neighbors: list[int] = [5, 15, 50], + n_epochs: list[int] = [200, 500], + min_dist: list[float] = [0.1, 0.5], + spread: list[float] = [1.0, 2.0], + metric: list[str] = ["euclidean", "cosine"], +) -> list[dict]: + """Generate arguments for the UMAP algorithm.""" + all_kwargs = [] + for n in n_neighbors: + for e in n_epochs: + for m in min_dist: + for s in spread: + for me in metric: + all_kwargs.append( + { + "umapNNeighbors": n, + "umapNEpochs": e, + "umapMinDist": m, + "umapSpread": s, + "umapMetric": me, + } + ) + return all_kwargs + + +@pytest.mark.parametrize("inp_format", FORMATS) +@pytest.mark.parametrize("algorithm", ALGORITHMS) +@pytest.mark.parametrize("n_components", [3]) +def test_cli( + inp_format: str, + algorithm: Algorithm, + n_components: int, +) -> None: + """Test the CLI.""" + + inp_dir, out_dir = create_data(inp_format) + + base_kwargs = { + "inpDir": str(inp_dir), + "outDir": str(out_dir), + "nComponents": str(n_components), + "algorithm": algorithm.value, + } + all_kwargs: list[dict] = [] + if algorithm == Algorithm.PCA: + all_kwargs = gen_pca_args() + elif algorithm == Algorithm.TSNE: + all_kwargs = gen_tsne_args() + elif algorithm == Algorithm.TSNE_INIT_PCA: + all_kwargs = gen_tsne_pca_args() + elif algorithm == Algorithm.UMAP: + all_kwargs = gen_umap_args() + else: + raise ValueError(f"Unknown algorithm {algorithm}") + + for inp_kwargs in all_kwargs: + kwargs = copy.deepcopy(base_kwargs) + kwargs.update(inp_kwargs) + + args = [] + for k, v in kwargs.items(): + args.extend(["--" + k, str(v)]) + + runner = typer.testing.CliRunner() + result = 
runner.invoke(app, args) + + assert result.exit_code == 0, f"CLI failed with {result.stdout}\n{args}" + + inp_dir = pathlib.Path(kwargs["inpDir"]) + out_dir = pathlib.Path(kwargs["outDir"]) + inp_files: list[pathlib.Path] = [p for p in inp_dir.iterdir()] + out_files: list[pathlib.Path] = [p for p in out_dir.iterdir()] + + assert len(inp_files) == 1 + assert len(out_files) == 1 + + for inp_path in inp_files: + out_path = out_dir.joinpath(inp_path.stem + ".feather") + msg = f"Missing {inp_path.stem} from {inp_files} in {out_files}\n{args}" + assert out_path in out_files, msg + + data = Formats.read(out_path) + assert data.shape == (1797, n_components) + assert data.dtype == numpy.float32 + + for out_path in out_files: + out_path.unlink() diff --git a/transforms/dimension-reduction-tool/tests/test_tool.py b/transforms/dimension-reduction-tool/tests/test_tool.py index ce83455..99bf3f1 100644 --- a/transforms/dimension-reduction-tool/tests/test_tool.py +++ b/transforms/dimension-reduction-tool/tests/test_tool.py @@ -1,53 +1,29 @@ """Tests for the tools.""" -import copy import pytest import numpy import sklearn.datasets from polus.tabular.transforms.dimension_reduction import algorithms - -def gen_pca_args( - n_components: list[int] = [2, 3, 10], - whiten: list[bool] = [True, False], - svd_solver: list[algorithms.SvdSolver] = [ - algorithms.SvdSolver.AUTO, - algorithms.SvdSolver.FULL, - algorithms.SvdSolver.ARPACK, - algorithms.SvdSolver.RANDOMIZED, - ], - tol: list[float] = [0.0, 0.1, 0.5, 1.0], -) -> list[dict]: - """Generate arguments for the PCA algorithm.""" - all_kwargs = [] - for n in n_components: - for w in whiten: - for s in svd_solver: - if s == algorithms.SvdSolver.ARPACK: - for t in tol: - all_kwargs.append( - { - "n_components": n, - "whiten": w, - "svd_solver": s, - "tol": t, - } - ) - else: - all_kwargs.append( - { - "n_components": n, - "whiten": w, - "svd_solver": s, - "tol": 0.0, - } - ) - return all_kwargs - - -@pytest.mark.parametrize("kwargs", gen_pca_args()) -def test_pca(kwargs: dict): +SVD_SOLVERS = [ + algorithms.SvdSolver.AUTO, + algorithms.SvdSolver.FULL, + algorithms.SvdSolver.ARPACK, + algorithms.SvdSolver.RANDOMIZED, +] + + +@pytest.mark.parametrize("n_components", [2, 10]) +@pytest.mark.parametrize("whiten", [True, False]) +@pytest.mark.parametrize("svd_solver", SVD_SOLVERS) +@pytest.mark.parametrize("tol", [0.0, 0.5]) +def test_pca( + n_components: int, + whiten: bool, + svd_solver: algorithms.SvdSolver, + tol: float, +): """Test the PCA algorithm.""" digits = sklearn.datasets.load_digits() @@ -55,44 +31,34 @@ def test_pca(kwargs: dict): assert data.shape == (1797, 64) - reduced = algorithms.pca.reduce(data.astype(numpy.float32), **kwargs) - - assert reduced.shape[0] == 1797 - assert reduced.shape[1] == kwargs["n_components"] + reduced = algorithms.pca.reduce( + data.astype(numpy.float32), + n_components=n_components, + whiten=whiten, + svd_solver=svd_solver, + tol=tol, + ) + + assert reduced.ndim == data.ndim + assert reduced.shape[0] == data.shape[0] + assert reduced.shape[1] == n_components assert reduced.dtype == numpy.float32 -def gen_tsne_args( - n_components: list[int] = [2, 3], - perplexity: list[float] = [5.0, 50.0], - early_exaggeration: list[float] = [5.0, 20.0], - learning_rate: list[float] = [200.0, "auto"], - max_iter: list[int] = [250, 1000], - metric: list[str] = ["euclidean", "cosine"], -) -> list[dict]: - """Generate arguments for testing the t-SNE algorithm.""" - all_kwargs = [] - for n in n_components: - for p in perplexity: - for e in 
early_exaggeration: - for l in learning_rate: - for m in max_iter: - for me in metric: - all_kwargs.append( - { - "n_components": n, - "perplexity": p, - "early_exaggeration": e, - "learning_rate": l, - "max_iter": m, - "metric": me, - } - ) - return all_kwargs - - -@pytest.mark.parametrize("kwargs", gen_tsne_args()) -def test_tsne(kwargs: dict): +@pytest.mark.parametrize("n_components", [3]) +@pytest.mark.parametrize("perplexity", [5.0, 50.0]) +@pytest.mark.parametrize("early_exaggeration", [5.0, 20.0]) +@pytest.mark.parametrize("learning_rate", [200.0, "auto"]) +@pytest.mark.parametrize("max_iter", [250, 1000]) +@pytest.mark.parametrize("metric", ["euclidean", "cosine"]) +def test_tsne( + n_components: int, + perplexity: float, + early_exaggeration: float, + learning_rate: float, + max_iter: int, + metric: str, +): """Test the t-SNE algorithm.""" digits = sklearn.datasets.load_digits() @@ -100,43 +66,30 @@ def test_tsne(kwargs: dict): assert data.shape == (1797, 64) - reduced = algorithms.tsne.reduce(data.astype(numpy.float32), **kwargs) - + reduced = algorithms.tsne.reduce( + data.astype(numpy.float32), + n_components=n_components, + perplexity=perplexity, + early_exaggeration=early_exaggeration, + learning_rate=learning_rate, + max_iter=max_iter, + metric=metric, + ) + + assert reduced.ndim == data.ndim assert reduced.shape[0] == 1797 - assert reduced.shape[1] == kwargs["n_components"] + assert reduced.shape[1] == n_components assert reduced.dtype == numpy.float32 -def gen_tsne_pca_args( - n_components: list[int] = [2, 3], - pca_n_components: list[int] = [10, 50], - perplexity: list[float] = [10.0, 30.0, 50.0], -) -> list[dict]: - """Generate arguments for testing the t-SNE algorithm with PCA initialization.""" - base_kwargs = { - "pca_n_components": 2, - "pca_whiten": False, - "pca_svd_solver": algorithms.SvdSolver.AUTO, - "pca_tol": 0.0, - "early_exaggeration": 12.0, - "learning_rate": "auto", - "max_iter": 1000, - "metric": "euclidean", - } - all_kwargs = [] - for n in n_components: - for p in pca_n_components: - for pe in perplexity: - kwargs = copy.deepcopy(base_kwargs) - kwargs["n_components"] = n - kwargs["pca_n_components"] = p - kwargs["perplexity"] = pe - all_kwargs.append(kwargs) - return all_kwargs - - -@pytest.mark.parametrize("kwargs", gen_tsne_pca_args()) -def test_tsne_pca(kwargs: dict): +@pytest.mark.parametrize("n_components", [2, 3]) +@pytest.mark.parametrize("pca_n_components", [10, 50]) +@pytest.mark.parametrize("perplexity", [5.0, 50.0]) +def test_tsne_pca( + n_components: int, + pca_n_components: int, + perplexity: float, +): """Test the t-SNE algorithm with PCA initialization.""" digits = sklearn.datasets.load_digits() @@ -144,44 +97,40 @@ def test_tsne_pca(kwargs: dict): assert data.shape == (1797, 64) - reduced = algorithms.tsne.reduce_init_pca(data.astype(numpy.float32), **kwargs) - + reduced = algorithms.tsne.reduce_init_pca( + data.astype(numpy.float32), + pca_n_components=pca_n_components, + pca_whiten=False, + pca_svd_solver=algorithms.SvdSolver.AUTO, + pca_tol=0.0, + n_components=n_components, + perplexity=perplexity, + early_exaggeration=12.0, + learning_rate="auto", + max_iter=1000, + metric="euclidean", + ) + + assert reduced.ndim == data.ndim assert reduced.shape[0] == 1797 - assert reduced.shape[1] == kwargs["n_components"] + assert reduced.shape[1] == n_components assert reduced.dtype == numpy.float32 -def gen_umap_args( - n_components: list[int] = [2, 3, 10], - n_neighbors: list[int] = [5, 15, 50], - metric: list[str] = ["euclidean", "cosine"], - 
n_epochs: list[int] = [None, 100],
-    min_dist: list[float] = [0.05, 0.1, 0.2],
-    spread: list[float] = [1.0, 2.0],
-) -> list[dict]:
-    """Generate arguments for the UMAP algorithm."""
-    all_kwargs = []
-    for n in n_components:
-        for nn in n_neighbors:
-            for m in metric:
-                for ne in n_epochs:
-                    for md in min_dist:
-                        for s in spread:
-                            all_kwargs.append(
-                                {
-                                    "n_components": n,
-                                    "n_neighbors": nn,
-                                    "metric": m,
-                                    "n_epochs": ne,
-                                    "min_dist": md,
-                                    "spread": s,
-                                }
-                            )
-    return all_kwargs
-
-
-@pytest.mark.parametrize("kwargs", gen_umap_args())
-def test_umap(kwargs: dict):
+@pytest.mark.parametrize("n_components", [3, 10])
+@pytest.mark.parametrize("n_neighbors", [10, 25])
+@pytest.mark.parametrize("metric", ["euclidean", "cosine"])
+@pytest.mark.parametrize("n_epochs", [None, 100])
+@pytest.mark.parametrize("min_dist", [0.05, 0.2])
+@pytest.mark.parametrize("spread", [1.0, 2.0])
+def test_umap(
+    n_components: int,
+    n_neighbors: int,
+    metric: str,
+    n_epochs: int,
+    min_dist: float,
+    spread: float,
+):
     """Test the UMAP algorithm."""
 
     digits = sklearn.datasets.load_digits()
     data: numpy.ndarray = digits.data
 
@@ -189,8 +138,17 @@ def test_umap(kwargs: dict):
     assert data.shape == (1797, 64)
 
-    reduced = algorithms.umap.reduce(data.astype(numpy.float32), **kwargs)
-
+    reduced = algorithms.umap.reduce(
+        data.astype(numpy.float32),
+        n_components=n_components,
+        n_neighbors=n_neighbors,
+        metric=metric,
+        n_epochs=n_epochs,
+        min_dist=min_dist,
+        spread=spread,
+    )
+
+    assert reduced.ndim == data.ndim
     assert reduced.shape[0] == 1797
-    assert reduced.shape[1] == kwargs["n_components"]
+    assert reduced.shape[1] == n_components
     assert reduced.dtype == numpy.float32

From 2ece3e9d4c488acbfa2e94ae944cdf2bcfd664d9 Mon Sep 17 00:00:00 2001
From: Najib Ishaq
Date: Wed, 24 Jul 2024 10:16:16 -0400
Subject: [PATCH 06/14] docs: updated README

---
 transforms/dimension-reduction-tool/README.md | 76 ++++++++++++++++---
 1 file changed, 65 insertions(+), 11 deletions(-)

diff --git a/transforms/dimension-reduction-tool/README.md b/transforms/dimension-reduction-tool/README.md
index a63711e..85f8d15 100644
--- a/transforms/dimension-reduction-tool/README.md
+++ b/transforms/dimension-reduction-tool/README.md
@@ -5,18 +5,56 @@ It provides the following methods for dimensionality reduction:
 
 1. Principal Component Analysis (PCA)
 2. t-Distributed Stochastic Neighbor Embedding (t-SNE)
-3. Uniform Manifold Approximation and Projection (UMAP)
+3. t-SNE with PCA initialization
+4. Uniform Manifold Approximation and Projection (UMAP)
 
-The input data should be in the form of a tabular file (`.csv` or `.arrow`).
+The input data should be in the form of a tabular file (`.csv`, `.feather`, `.parquet`, or `.npy`).
 This tool takes tabular data as input and outputs a reduced dimensionality version of the input data.
 Each method has its own set of parameters that can be tuned to get the desired output.
 
-The CLI parameters are:
+The CLI parameters required for all methods are:
 
 1. `--inpDir`: Directory containing input tabular data.
 2. `--filePattern`: Pattern to parse tabular files.
-3. `--preview`: Generate JSON file with outputs without running the tool.
-4. `--outDir`: Output directory.
+3. `--algorithm`: Dimensionality reduction algorithm to use. Options are `pca`, `tsne`, `tsne_init_pca`, and `umap`.
+4. `--nComponents`: Number of dimensions to reduce to.
+5. `--outDir`: Output directory.
+
+You can also use the `--preview` flag to generate a JSON file indicating what the outputs would be without running the tool.
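+
+For example, a minimal run of the default UMAP algorithm (the image tag and mounted paths here are illustrative, not part of this PR) might look like:
+
+```bash
+# Illustrative invocation; adjust the tag and mounted paths for your setup.
+docker run -v /path/to/data:/data polusai/dimension-reduction-tool:0.1.0-dev0 \
+  --inpDir /data/inputs \
+  --filePattern ".*" \
+  --algorithm umap \
+  --nComponents 2 \
+  --outDir /data/outputs
+```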
+
+For PCA, the required parameters are:
+
+- `--pcaWhiten`: Boolean flag to indicate whether to whiten the data.
+- `--pcaSvdSolver`: Solver to use for PCA. Options are `auto`, `full`, `arpack`, and `randomized`.
+- `--pcaTol`: Tolerance for PCA with the `arpack` solver.
+
+For more details on each parameter, see [the documentation here](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html).
+
+For t-SNE, the required parameters are:
+
+- `--tsnePerplexity`: Perplexity parameter for t-SNE.
+- `--tsneEarlyExaggeration`: Early exaggeration factor for t-SNE.
+- `--tsneLearningRate`: Learning rate for t-SNE.
+- `--tsneMaxIter`: Maximum number of iterations for t-SNE.
+- `--tsneMetric`: The distance metric to use for t-SNE.
+
+For more details on each parameter, see [the documentation here](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html).
+
+For t-SNE with PCA initialization, the required parameters are:
+
+- All parameters required for t-SNE.
+- `--tsneInitNComponents`: Number of components to use for PCA initialization.
+- All parameters required for PCA.
+
+For UMAP, the required parameters are:
+
+- `--umapNNeighbors`: Number of neighbors to use for UMAP.
+- `--umapNEpochs`: Number of epochs for UMAP.
+- `--umapMinDist`: Minimum distance between points in UMAP.
+- `--umapSpread`: Spread of UMAP.
+- `--umapMetric`: The distance metric to use for UMAP.
+
+For more details on each parameter, see [the documentation here](https://umap-learn.readthedocs.io/en/latest/parameters.html).
 
 ## Docker Container
 
@@ -32,9 +70,25 @@ For more information on WIPP, visit the [official WIPP page](https://isg.nist.go
 
-This plugin takes seven input arguments and one output argument:
+This plugin takes nineteen input arguments and one output argument:
 
-| Name             | Description                                                | I/O    | Type        | Default |
-| ---------------- | ---------------------------------------------------------- | ------ | ----------- | ------- |
-| `--inpDir`       | Directory containing input tabular data.                   | Input  | genericData | N/A     |
-| `--filePattern`  | Pattern to parse tabular files.                            | Input  | string      | ".*"    |
-| `--preview`      | Generate JSON file with outputs without running the tool.  | Input  | boolean     | False   |
-| `--outDir`       | Output directory.                                          | Output | genericData | N/A     |
+| Name                      | Description                                                | I/O    | Type        | Default   |
+| ------------------------- | ---------------------------------------------------------- | ------ | ----------- | --------- |
+| `--inpDir`                | Directory containing input tabular data.                   | Input  | genericData | N/A       |
+| `--filePattern`           | Pattern to parse tabular files.                            | Input  | string      | ".*"      |
+| `--preview`               | Generate JSON file with outputs without running the tool.  | Input  | boolean     | False     |
+| `--outDir`                | Output directory.                                          | Output | genericData | N/A       |
+| `--algorithm`             | Dimensionality reduction algorithm to use.                 | Input  | enum        | umap      |
+| `--nComponents`           | Number of dimensions to reduce to.                         | Input  | int         | N/A       |
+| `--pcaWhiten`             | Boolean flag to indicate whether to whiten the data.       | Input  | boolean     | False     |
+| `--pcaSvdSolver`          | Solver to use for PCA.                                     | Input  | enum        | auto      |
+| `--pcaTol`                | Tolerance for PCA with the `arpack` solver.                | Input  | float       | 0.0       |
+| `--tsnePerplexity`        | Perplexity parameter for t-SNE.                            | Input  | float       | 30.0      |
+| `--tsneEarlyExaggeration` | Early exaggeration factor for t-SNE.                       | Input  | float       | 12.0      |
+| `--tsneLearningRate`      | Learning rate for t-SNE.                                   | Input  | float       | 200.0     |
+| `--tsneMaxIter`           | Maximum number of iterations for t-SNE.
| Input | int | 1000 | +| `--tsneMetric` | The distance metric to use for t-SNE. | Input | string | euclidean | +| `--tsneInitNComponents` | Number of components to use for PCA initialization. | Input | int | 50 | +| `--umapNNeighbors` | Number of neighbors to use for UMAP. | Input | int | 15 | +| `--umapNEpochs` | Number of epochs for UMAP. | Input | int | 500 | +| `--umapMinDist` | Minimum distance between points in UMAP. | Input | float | 0.1 | +| `--umapSpread` | Spread of UMAP. | Input | float | 1.0 | +| `--umapMetric` | The distance metric to use for UMAP. | Input | string | euclidean | From db05e86f14fb5ca7ec90aaea2400e5559ca42c90 Mon Sep 17 00:00:00 2001 From: Najib Ishaq Date: Wed, 24 Jul 2024 10:24:33 -0400 Subject: [PATCH 07/14] chore: updated plugin.json --- .../dimension-reduction-tool/plugin.json | 208 +++++++++++++++++- 1 file changed, 201 insertions(+), 7 deletions(-) diff --git a/transforms/dimension-reduction-tool/plugin.json b/transforms/dimension-reduction-tool/plugin.json index d61c2a3..5471b9d 100644 --- a/transforms/dimension-reduction-tool/plugin.json +++ b/transforms/dimension-reduction-tool/plugin.json @@ -18,20 +18,126 @@ { "name": "inpDir", "description": "Input tabular data", - "type": "genericData", - "required": "true" + "required": true, + "type": "genericData" }, { "name": "filePattern", "description": "Pattern to parse input files", - "type": "string", - "required": false + "required": false, + "type": "string" }, { "name": "preview", "description": "Output a JSON preview of outputs produced by this plugin", - "type": "boolean", - "required": false + "required": false, + "type": "boolean" + }, + { + "name": "algorithm", + "description": "Dimension reduction algorithm", + "required": false, + "type": "enum", + "options": { + "values": [ + "pca", + "tsne", + "tsne_init_pca", + "umap" + ] + } + }, + { + "name": "nComponents", + "description": "Number of components to keep", + "required": true, + "type": "integer" + }, + { + "name": "pcaWhiten", + "description": "Whiten PCA components", + "required": false, + "type": "boolean" + }, + { + "name": "pcaSvdSolver", + "description": "SVD solver for PCA", + "required": false, + "type": "enum", + "options": { + "values": [ + "auto", + "full", + "arpack", + "randomized" + ] + } + }, + { + "name": "pcaTol", + "description": "Tolerance for PCA when using arpack solver", + "required": false, + "type": "float" + }, + { + "name": "tsnePerplexity", + "description": "Perplexity for t-SNE", + "required": false, + "type": "float" + }, + { + "name": "tsneEarlyExaggeration", + "description": "Early exaggeration for t-SNE", + "required": false, + "type": "float" + }, + { + "name": "tsneLearningRate", + "description": "Learning rate for t-SNE", + "required": false, + "type": "float" + }, + { + "name": "tsneMaxIter", + "description": "Maximum number of iterations for t-SNE", + "required": false, + "type": "integer" + }, + { + "name": "tsneMetric", + "description": "Metric for t-SNE", + "required": false, + "type": "string" + }, + { + "name": "tsneInitNComponents", + "description": "Number of PCA components to initialize t-SNE", + "required": false, + "type": "integer" + }, + { + "name": "umapNNeighbors", + "description": "Number of neighbors for UMAP", + "required": false, + "type": "integer" + }, + { + "name": "umapMinDist", + "description": "Minimum distance for UMAP", + "required": false, + "type": "float" + }, + { + "name": "umapSpread", + "description": "Spread for UMAP", + "required": false, + "type": "float" + }, + { + 
"name": "umapMetric", + "description": "Metric for UMAP", + "required": false, + "type": "string" } ], "outputs": [ @@ -58,8 +164,96 @@ "key": "inputs.preview", "title": "Preview", "description": "Output a JSON preview of outputs produced by this plugin", - "type": "boolean", "default": false + }, + { + "key": "inputs.algorithm", + "title": "Algorithm", + "description": "Dimension reduction algorithm", + "default": "pca" + }, + { + "key": "inputs.nComponents", + "title": "Number of components", + "description": "Number of components to keep" + }, + { + "key": "inputs.pcaWhiten", + "title": "Whiten", + "description": "Whiten PCA components", + "default": false + }, + { + "key": "inputs.pcaSvdSolver", + "title": "SVD Solver", + "description": "SVD solver for PCA", + "default": "auto" + }, + { + "key": "inputs.pcaTol", + "title": "Tolerance", + "description": "Tolerance for PCA when using arpack solver", + "default": 0.0 + }, + { + "key": "inputs.tsnePerplexity", + "title": "Perplexity", + "description": "Perplexity for t-SNE", + "default": 30.0 + }, + { + "key": "inputs.tsneEarlyExaggeration", + "title": "Early Exaggeration", + "description": "Early exaggeration for t-SNE", + "default": 12.0 + }, + { + "key": "inputs.tsneLearningRate", + "title": "Learning Rate", + "description": "Learning rate for t-SNE", + "default": 200.0 + }, + { + "key": "inputs.tsneMaxIter", + "title": "Max Iterations", + "description": "Maximum number of iterations for t-SNE", + "default": 1000 + }, + { + "key": "inputs.tsneMetric", + "title": "Metric", + "description": "Metric for t-SNE", + "default": "euclidean" + }, + { + "key": "inputs.tsneInitNComponents", + "title": "Init N Components", + "description": "Number of PCA components to initialize t-SNE", + "default": 50 + }, + { + "key": "inputs.umapNNeighbors", + "title": "N Neighbors", + "description": "Number of neighbors for UMAP", + "default": 15 + }, + { + "key": "inputs.umapMinDist", + "title": "Min Dist", + "description": "Minimum distance for UMAP", + "default": 0.1 + }, + { + "key": "inputs.umapSpread", + "title": "Spread", + "description": "Spread for UMAP", + "default": 1.0 + }, + { + "key": "inputs.umapMetric", + "title": "Metric", + "description": "Metric for UMAP", + "default": "euclidean" } ] } From 111f082f46acda5336ab70f054b84ebe8bdc8dfe Mon Sep 17 00:00:00 2001 From: Najib Ishaq Date: Wed, 24 Jul 2024 10:32:34 -0400 Subject: [PATCH 08/14] chore: update ict --- transforms/dimension-reduction-tool/ict.yaml | 211 ++++++++++++++++++- 1 file changed, 209 insertions(+), 2 deletions(-) diff --git a/transforms/dimension-reduction-tool/ict.yaml b/transforms/dimension-reduction-tool/ict.yaml index f44c1b2..0a42a1a 100644 --- a/transforms/dimension-reduction-tool/ict.yaml +++ b/transforms/dimension-reduction-tool/ict.yaml @@ -27,7 +27,119 @@ inputs: required: false type: boolean -name: polusai/K-MeansClustering +- description: The algorithm to use for dimension reduction + format: + - enum + name: algorithm + required: false + type: string + +- description: The number of components to keep + format: + - integer + name: nComponents + required: false + type: integer + +- description: PCA, whether to whiten the data + format: + - boolean + name: pcaWhiten + required: false + type: boolean + +- description: PCA, which SVD solver to use + format: + - enum + name: pcaSvdSolver + required: false + type: string + +- description: PCA, what tolerance to use with the arpack solver + format: + - float + name: pcaTol + required: false + type: float + +- description: 
t-SNE, the perplexity parameter + format: + - float + name: tsnePerplexity + required: false + type: float + +- description: t-SNE, the early exaggeration parameter + format: + - float + name: tsneEarlyExaggeration + required: false + type: float + +- description: t-SNE, the learning rate parameter + format: + - float + name: tsneLearningRate + required: false + type: float + +- description: t-SNE, the maximum number of iterations + format: + - integer + name: tsneMaxIter + required: false + type: integer + +- description: t-SNE, the distance metric to use + format: + - enum + name: tsneMetric + required: false + type: string + +- description: t-SNE, the number of components to initialize with PCA + format: + - integer + name: tsneInitNComponents + required: false + type: integer + +- description: UMAP, the number of neighbors to use + format: + - integer + name: umapNNeighbors + required: false + type: integer + +- description: UMAP, the number of epochs to use + format: + - integer + name: umapNEpochs + required: false + type: integer + +- description: UMAP, the minimum distance between points + format: + - float + name: umapMinDist + required: false + type: float + +- description: UMAP, the spread of the embedding + format: + - float + name: umapSpread + required: false + type: float + +- description: UMAP, the metric to use + format: + - enum + name: umapMetric + required: false + type: string + +name: polusai/dimension-reduction outputs: @@ -46,7 +158,7 @@ title: Dimension Reduction ui: -- description: Input tabular data for clustering +- description: Input tabular data for dimension reduction key: inputs.inpDir title: Input tabular data type: path @@ -56,4 +168,99 @@ ui: title: FilePattern type: text +- description: Output a JSON preview of outputs produced by this plugin + key: inputs.preview + title: Preview + type: boolean + +- description: The algorithm to use for dimension reduction + fields: + - pca + - tsne + - tsne_init_pca + - umap + key: inputs.algorithm + title: Algorithm + type: string + +- description: The number of components to keep + key: inputs.nComponents + title: nComponents + type: integer + +- description: PCA, whether to whiten the data + key: inputs.pcaWhiten + title: pcaWhiten + type: boolean + +- description: PCA, which SVD solver to use + fields: + - auto + - full + - arpack + - randomized + key: inputs.pcaSvdSolver + title: pcaSvdSolver + type: string + +- description: PCA, what tolerance to use with the arpack solver + key: inputs.pcaTol + title: pcaTol + type: float + +- description: t-SNE, the perplexity parameter + key: inputs.tsnePerplexity + title: tsnePerplexity + type: float + +- description: t-SNE, the early exaggeration parameter + key: inputs.tsneEarlyExaggeration + title: tsneEarlyExaggeration + type: float + +- description: t-SNE, the learning rate parameter + key: inputs.tsneLearningRate + title: tsneLearningRate + type: float + +- description: t-SNE, the maximum number of iterations + key: inputs.tsneMaxIter + title: tsneMaxIter + type: integer + +- description: t-SNE, the distance metric to use + key: inputs.tsneMetric + title: tsneMetric + type: string + +- description: t-SNE, the number of components to initialize with PCA + key: inputs.tsneInitNComponents + title: tsneInitNComponents + type: integer + +- description: UMAP, the number of neighbors to use + key: inputs.umapNNeighbors + title: umapNNeighbors + type: integer + +- description: UMAP, the number of epochs to use + key: inputs.umapNEpochs + title: umapNEpochs + type: integer + +- 
description: UMAP, the minimum distance between points + key: inputs.umapMinDist + title: umapMinDist + type: float + +- description: UMAP, the spread of the embedding + key: inputs.umapSpread + title: umapSpread + type: float + +- description: UMAP, the metric to use + key: inputs.umapMetric + title: umapMetric + type: string + version: 0.1.0-dev0 From dec89eabbeeb9ef1aeda853828f68596e522412f Mon Sep 17 00:00:00 2001 From: Najib Ishaq Date: Wed, 24 Jul 2024 10:36:14 -0400 Subject: [PATCH 09/14] chore: update cwl --- .../dimensionreduction.cwl | 72 +++++++++++++++++-- 1 file changed, 68 insertions(+), 4 deletions(-) diff --git a/transforms/dimension-reduction-tool/dimensionreduction.cwl b/transforms/dimension-reduction-tool/dimensionreduction.cwl index 6066c58..9153b1e 100644 --- a/transforms/dimension-reduction-tool/dimensionreduction.cwl +++ b/transforms/dimension-reduction-tool/dimensionreduction.cwl @@ -1,14 +1,14 @@ class: CommandLineTool cwlVersion: v1.2 inputs: - filePattern: - inputBinding: - prefix: --filePattern - type: string? inpDir: inputBinding: prefix: --inpDir type: Directory + filePattern: + inputBinding: + prefix: --filePattern + type: string? outDir: inputBinding: prefix: --outDir @@ -17,6 +17,70 @@ inputs: inputBinding: prefix: --preview type: boolean? + algorithm: + inputBinding: + prefix: --algorithm + type: string? + nComponents: + inputBinding: + prefix: --nComponents + type: int + pcaWhiten: + inputBinding: + prefix: --pcaWhiten + type: boolean? + pcaSvdSolver: + inputBinding: + prefix: --pcaSvdSolver + type: string? + pcaTol: + inputBinding: + prefix: --pcaTol + type: float? + tsnePerplexity: + inputBinding: + prefix: --tsnePerplexity + type: float? + tsneEarlyExaggeration: + inputBinding: + prefix: --tsneEarlyExaggeration + type: float? + tsneLearningRate: + inputBinding: + prefix: --tsneLearningRate + type: float? + tsneMaxIter: + inputBinding: + prefix: --tsneMaxIter + type: int? + tsneMetric: + inputBinding: + prefix: --tsneMetric + type: string? + tsneInitNComponents: + inputBinding: + prefix: --tsneInitNComponents + type: int? + umapNNeighbors: + inputBinding: + prefix: --umapNNeighbors + type: int? + umapNEpochs: + inputBinding: + prefix: --umapNEpochs + type: int? + umapMinDist: + inputBinding: + prefix: --umapMinDist + type: float? + umapSpread: + inputBinding: + prefix: --umapSpread + type: float? + umapMetric: + inputBinding: + prefix: --umapMetric + type: string? 
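+# Note: the pca*, tsne*, and umap* inputs above configure their respective
+# algorithms; per this tool's README, only the parameters matching the chosen
+# `algorithm` are consulted, with `tsne_init_pca` drawing on both the pca*
+# and tsne* groups.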
outputs: outDir: outputBinding: From f60a414d756ea8b90bddc325b6c1c8552000f079 Mon Sep 17 00:00:00 2001 From: Najib Ishaq Date: Wed, 24 Jul 2024 11:15:58 -0400 Subject: [PATCH 10/14] chore: moved code from other branch --- .../Dockerfile | 25 +++ .../README.md | 55 +++++++ .../VERSION | 1 + .../build-docker.sh | 22 +++ .../dimensionreductionqualitymetrics.cwl | 56 +++++++ .../ict.yaml | 129 ++++++++++++++++ .../plugin.json | 135 ++++++++++++++++ .../pyproject.toml | 35 +++++ .../run-plugin.sh | 22 +++ .../__init__.py | 94 ++++++++++++ .../__main__.py | 145 ++++++++++++++++++ .../metrics/__init__.py | 5 + .../metrics/fnn.py | 78 ++++++++++ .../tests/__init__.py | 1 + .../tests/conftest.py | 14 ++ .../dimension_reduction/__init__.py | 1 + 16 files changed, 818 insertions(+) create mode 100644 features/dimension-reduction-quality-metrics-tool/Dockerfile create mode 100644 features/dimension-reduction-quality-metrics-tool/README.md create mode 100644 features/dimension-reduction-quality-metrics-tool/VERSION create mode 100644 features/dimension-reduction-quality-metrics-tool/build-docker.sh create mode 100644 features/dimension-reduction-quality-metrics-tool/dimensionreductionqualitymetrics.cwl create mode 100644 features/dimension-reduction-quality-metrics-tool/ict.yaml create mode 100644 features/dimension-reduction-quality-metrics-tool/plugin.json create mode 100644 features/dimension-reduction-quality-metrics-tool/pyproject.toml create mode 100644 features/dimension-reduction-quality-metrics-tool/run-plugin.sh create mode 100644 features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/__init__.py create mode 100644 features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/__main__.py create mode 100644 features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/metrics/__init__.py create mode 100644 features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/metrics/fnn.py create mode 100644 features/dimension-reduction-quality-metrics-tool/tests/__init__.py create mode 100644 features/dimension-reduction-quality-metrics-tool/tests/conftest.py diff --git a/features/dimension-reduction-quality-metrics-tool/Dockerfile b/features/dimension-reduction-quality-metrics-tool/Dockerfile new file mode 100644 index 0000000..00331ca --- /dev/null +++ b/features/dimension-reduction-quality-metrics-tool/Dockerfile @@ -0,0 +1,25 @@ +FROM polusai/bfio:2.3.6 + +# environment variables defined in polusai/bfio +ENV EXEC_DIR="/opt/executables" +ENV POLUS_IMG_EXT=".ome.tif" +ENV POLUS_TAB_EXT=".feather" +ENV POLUS_LOG="INFO" + +# Work directory defined in the base container +WORKDIR ${EXEC_DIR} + +# TODO: Change the tool_dir to the tool directory +ENV TOOL_DIR="features/dimension-reduction-quality-metrics-tool" + +# Copy the repository into the container +RUN mkdir tabular-tools +COPY . 
${EXEC_DIR}/tabular-tools
+
+# Install the tool
+RUN pip3 install "${EXEC_DIR}/tabular-tools/${TOOL_DIR}" --no-cache-dir
+
+# Set the entrypoint
+# TODO: Change the entrypoint to the tool entrypoint
+ENTRYPOINT ["python3", "-m", "polus.tabular.features.dimension_reduction_quality_metrics"]
+CMD ["--help"]
diff --git a/features/dimension-reduction-quality-metrics-tool/README.md b/features/dimension-reduction-quality-metrics-tool/README.md
new file mode 100644
index 0000000..8646e15
--- /dev/null
+++ b/features/dimension-reduction-quality-metrics-tool/README.md
@@ -0,0 +1,55 @@
+# Dimension Reduction Quality Metrics (v0.1.0-dev0)
+
+This tool is used to measure the quality of dimensionality reductions.
+It provides the following metrics for assessing that quality:
+
+1. False Nearest Neighbors (FNN).
+
+## FNN
+
+Consider a query in the original space and some of its nearest neighbors.
+Find the nearest neighbors of the query in the reduced space.
+If the nearest neighbors in the reduced space are not the same as the nearest neighbors in the original space, then the reduced space is not a good representation of the original space.
+FNN is the mean recall of the nearest neighbors in the reduced space over a large number of queries.
+For example, if 7 of a query's 10 nearest neighbors in the original space are also among its 10 nearest neighbors in the reduced space, that query contributes a recall of 0.7 to the mean.
+
+## Parameters
+
+This tool takes the following parameters:
+
+1. `--originalDir`: Directory containing the original data.
+2. `--originalPattern`: Pattern to parse original files.
+3. `--embeddedDir`: Directory containing the reduced data.
+4. `--embeddedPattern`: Pattern to parse reduced files.
+5. `--numQueries`: Number of queries to use.
+6. `--ks`: Comma-separated list of numbers of nearest neighbors to consider.
+7. `--distanceMetrics`: Comma-separated list of distance metrics to use.
+8. `--qualityMetrics`: Comma-separated list of quality metrics to use.
+9. `--outDir`: Output directory.
+10. `--preview`: Generate JSON file with outputs without running the tool.
+
+## Docker Container
+
+To build the Docker image for this tool, run `./build-docker.sh`.
+
+## Install WIPP Plugin
+
+If WIPP is running, navigate to the plugins page and add a new plugin.
+Paste the contents of `plugin.json` into the pop-up window and submit.
+For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp).
+
+## Options
+
+This plugin takes nine input arguments and one output argument:
+
+| Name                | Description                                               | I/O    | Type        | Default            |
+| ------------------- | --------------------------------------------------------- | ------ | ----------- | ------------------ |
+| `--originalDir`     | Directory containing the original data.                   | Input  | genericData | N/A                |
+| `--originalPattern` | Pattern to parse original files.                          | Input  | string      | ".*"               |
+| `--embeddedDir`     | Directory containing the reduced data.                    | Input  | genericData | N/A                |
+| `--embeddedPattern` | Pattern to parse reduced files.                           | Input  | string      | ".*"               |
+| `--numQueries`      | Number of queries to use.                                 | Input  | int         | 1000               |
+| `--ks`              | Comma-separated list of numbers of nearest neighbors.     | Input  | string      | "10,100"           |
+| `--distanceMetrics` | Comma-separated list of distance metrics to use.          | Input  | string      | "euclidean,cosine" |
+| `--qualityMetrics`  | Comma-separated list of quality metrics to use.           | Input  | string      | "fnn"              |
+| `--outDir`          | Output directory.                                         | Output | genericData | N/A                |
+| `--preview`         | Generate JSON file with outputs without running the tool.
| Input | boolean | False | diff --git a/features/dimension-reduction-quality-metrics-tool/VERSION b/features/dimension-reduction-quality-metrics-tool/VERSION new file mode 100644 index 0000000..6b1a238 --- /dev/null +++ b/features/dimension-reduction-quality-metrics-tool/VERSION @@ -0,0 +1 @@ +0.1.0-dev1 diff --git a/features/dimension-reduction-quality-metrics-tool/build-docker.sh b/features/dimension-reduction-quality-metrics-tool/build-docker.sh new file mode 100644 index 0000000..8f573d0 --- /dev/null +++ b/features/dimension-reduction-quality-metrics-tool/build-docker.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# TODO: Change the name of the tool here +tool_dir="features" +tool_name="dimension-reduction-quality-metrics-tool" + +# The version is read from the VERSION file +version=$(", +] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = ">=3.9,<3.12" +filepattern = "^2.0.0" +typer = "^0.7.0" +numpy = "<2.0.0" +polus_tabular_transforms_dimension_reduction = { path = "../../transforms/dimension-reduction-tool", develop = true } +tqdm = "^4.66.4" +scipy = "^1.13" + +[tool.poetry.group.dev.dependencies] +bump2version = "^1.0.1" +pre-commit = "^3.0.4" +pytest = "^7.2.1" +pytest-sugar = "^1.0.0" +pytest-xdist = "^3.6.1" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.ruff] +extend = "../../ruff.toml" +ignore = [ + "PLR0913", # Too many arguments to function call +] diff --git a/features/dimension-reduction-quality-metrics-tool/run-plugin.sh b/features/dimension-reduction-quality-metrics-tool/run-plugin.sh new file mode 100644 index 0000000..627e9a2 --- /dev/null +++ b/features/dimension-reduction-quality-metrics-tool/run-plugin.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +version=$( dict[int, dict[str, dict[str, float]]]: + """Measure the quality of the dimension reduction using different metrics. + + Args: + original_path: The path to the original data. + embedded_path: The path to the embedded data. + num_queries: The number of queries to use. + ks: The numbers of nearest neighbors to consider. + distance_metrics: The distance metrics to use. + quality_metrics: The quality metrics to compute. + + Returns: + A dictionary containing the computed metrics. 
The format is: + { + k_1: { + distance_metric_1: { + quality_metric_1: value, + quality_metric_2: value, + }, + distance_metric_2: { + quality_metric_1: value, + quality_metric_2: value, + }, + }, + k_2: { + distance_metric_1: { + quality_metric_1: value, + quality_metric_2: value, + }, + distance_metric_2: { + quality_metric_1: value, + quality_metric_2: value, + }, + }, + } + """ + original_data = Formats.read(original_path) + embedded_data = Formats.read(embedded_path) + + rng = numpy.random.default_rng() + query_indices = rng.choice( + original_data.shape[0], + size=num_queries, + replace=False, + ) + + quality: dict[int, dict[str, dict[str, float]]] = {} + for k in ks: + quality[k] = {} + for distance_metric in distance_metrics: + quality[k][distance_metric] = {} + for quality_metric in quality_metrics: + metric_func = getattr(metrics, quality_metric) + quality[k][distance_metric][quality_metric] = metric_func( + original_data=original_data, + embedded_data=embedded_data, + query_indices=query_indices, + n_neighbors=k, + distance_metric=distance_metric, + ) + + return quality + + +__all__ = [ + "measure_quality", + "POLUS_LOG_LVL", + "POLUS_TAB_EXT", + "__version__", +] diff --git a/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/__main__.py b/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/__main__.py new file mode 100644 index 0000000..ffa45f2 --- /dev/null +++ b/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/__main__.py @@ -0,0 +1,145 @@ +"""CLI for the Dimension Reduction tool.""" + +import json +import logging +import pathlib + +import filepattern +import tqdm +import typer +from polus.tabular.features.dimension_reduction_quality_metrics import measure_quality +from polus.tabular.transforms.dimension_reduction import POLUS_LOG_LVL + +# Initialize the logger +logging.basicConfig( + format="%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s", + datefmt="%d-%b-%y %H:%M:%S", +) +logger = logging.getLogger("polus.tabular.transforms.dimension_reduction") +logger.setLevel(POLUS_LOG_LVL) + +app = typer.Typer() + + +@app.command() +def main( + original_dir: pathlib.Path = typer.Option( + ..., + "--originalDir", + help="Directory containing the original data", + exists=True, + file_okay=False, + dir_okay=True, + readable=True, + resolve_path=True, + ), + original_pattern: str = typer.Option( + ".*", + "--originalPattern", + help="pattern to parse tabular files for the original data", + ), + embedded_dir: pathlib.Path = typer.Option( + ..., + "--embeddedDir", + help="Directory containing the embedded data", + exists=True, + file_okay=False, + dir_okay=True, + readable=True, + resolve_path=True, + ), + embedded_pattern: str = typer.Option( + ".*", + "--embeddedPattern", + help="pattern to parse tabular files for the embedded data", + ), + num_queries: int = typer.Option( + 1000, + "--numQueries", + help="Number of queries to use for the quality metrics", + ), + ks: str = typer.Option( + "10,100", + "--ks", + help="Comma-separated list of numbers of nearest neighbors to consider", + ), + distance_metrics: str = typer.Option( + "euclidean,cosine", + "--distanceMetrics", + help="Comma-separated list of distance metrics to use", + ), + quality_metrics: str = typer.Option( + "fnn", + "--qualityMetrics", + help="Comma-separated list of quality metrics to compute", + ), + out_dir: pathlib.Path = 
typer.Option(
+        ...,
+        "--outDir",
+        help="Output collection",
+        exists=True,
+        file_okay=False,
+        dir_okay=True,
+        writable=True,
+        resolve_path=True,
+    ),
+    preview: bool = typer.Option(
+        False,
+        "--preview",
+        help="Output a JSON preview of outputs produced by this tool",
+    ),
+) -> None:
+    """CLI for the Dimension Reduction Quality Metrics tool."""
+    logger.info(f"originalDir = {original_dir}")
+    logger.info(f"originalPattern = {original_pattern}")
+    logger.info(f"embeddedDir = {embedded_dir}")
+    logger.info(f"embeddedPattern = {embedded_pattern}")
+    logger.info(f"numQueries = {num_queries}")
+    logger.info(f"ks = {ks}")
+    logger.info(f"distanceMetrics = {distance_metrics}")
+    logger.info(f"qualityMetrics = {quality_metrics}")
+    logger.info(f"outDir = {out_dir}")
+    logger.info(f"preview = {preview}")
+
+    original_fp = filepattern.FilePattern(original_dir, original_pattern)
+    original_files = [pathlib.Path(p) for _, [p] in original_fp()]
+    original_dict = {f.stem: f for f in original_files}
+
+    embedded_fp = filepattern.FilePattern(embedded_dir, embedded_pattern)
+    embedded_files = [pathlib.Path(p) for _, [p] in embedded_fp()]
+    embedded_dict = {f.stem: f for f in embedded_files}
+
+    data_pairs: dict[str, tuple[pathlib.Path, pathlib.Path]] = {}
+    for stem in original_dict:
+        if stem in embedded_dict:
+            data_pairs[stem] = (original_dict[stem], embedded_dict[stem])
+        else:
+            logger.warning(f"No matching embedded file found for {stem}")
+    for stem in embedded_dict:
+        if stem not in original_dict:
+            logger.warning(f"No matching original file found for {stem}")
+
+    if preview:
+        logger.info(f"Previewing {len(data_pairs)} pairs of data")
+        msg = "Not implemented yet"
+        raise NotImplementedError(msg)
+
+    for original_path, embedded_path in tqdm.tqdm(
+        data_pairs.values(),
+        total=len(data_pairs),
+    ):
+        out_path = out_dir / f"{original_path.stem}.json"
+        # Split the comma-separated CLI strings into the lists that
+        # `measure_quality` iterates over, and bind the result to a new name
+        # so the `quality_metrics` argument is not clobbered between files.
+        quality = measure_quality(
+            original_path=original_path,
+            embedded_path=embedded_path,
+            num_queries=num_queries,
+            ks=[int(k) for k in ks.split(",")],
+            distance_metrics=distance_metrics.split(","),
+            quality_metrics=quality_metrics.split(","),
+        )
+        with out_path.open("w") as f:
+            json.dump(quality, f)
+
+
+if __name__ == "__main__":
+    app()
diff --git a/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/metrics/__init__.py b/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/metrics/__init__.py
new file mode 100644
index 0000000..9cf49ac
--- /dev/null
+++ b/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/metrics/__init__.py
@@ -0,0 +1,5 @@
+"""Different metrics for the quality of dimension reduction."""
+
+from .fnn import fnn
+
+__all__ = ["fnn"]
diff --git a/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/metrics/fnn.py b/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/metrics/fnn.py
new file mode 100644
index 0000000..45852e3
--- /dev/null
+++ b/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/metrics/fnn.py
@@ -0,0 +1,78 @@
+"""False Nearest Neighbors (FNN) metric.
+
+Consider a query in the original space and some of its nearest neighbors. For
+this query, find the nearest neighbors in the embedded space.
FNN is the mean +recall of the nearest neighbors in the embedded space for a large enough number +of queries. + +Intuitively, if the embedding is good, the nearest neighbors in the original +space should also be the nearest neighbors in the embedded space. +""" + +import numpy +import scipy.spatial.distance + + +def fnn( + original_data: numpy.ndarray, + embedded_data: numpy.ndarray, + query_indices: numpy.ndarray, + n_neighbors: int, + distance_metric: str, +) -> float: + """Compute the False Nearest Neighbors (FNN) metric. + + Args: + original_data: The original data. + embedded_data: The embedded data. + query_indices: The indices of the queries in the original space. + n_neighbors: The number of nearest neighbors to consider. + distance_metric: The distance metric to use. + + Returns: + The FNN metric. + """ + original_knn = knn_search( + data=original_data, + queries=original_data[query_indices], + k=n_neighbors, + metric=distance_metric, + ) + + embedded_knn = knn_search( + data=embedded_data, + queries=embedded_data[query_indices], + k=n_neighbors, + metric=distance_metric, + ) + + recalls = [] + for i, _ in enumerate(query_indices): + original_neighbors = original_knn[i] + embedded_neighbors = embedded_knn[i] + recall = len(set(original_neighbors) & set(embedded_neighbors)) / n_neighbors + recalls.append(recall) + + return numpy.mean(recalls) + + +def knn_search( + data: numpy.ndarray, + queries: numpy.ndarray, + k: int, + metric: str, +) -> numpy.ndarray: + """Find the nearest neighbors of the queries in the data. + + Args: + data: The data. + queries: The queries. + k: The number of nearest neighbors to find. + metric: The distance metric to use. + + Returns: + The indices of the nearest neighbors. + """ + distances = scipy.spatial.distance.cdist(queries, data, metric) + sorted_indices = numpy.argsort(distances, axis=1) + return sorted_indices[:, :k] diff --git a/features/dimension-reduction-quality-metrics-tool/tests/__init__.py b/features/dimension-reduction-quality-metrics-tool/tests/__init__.py new file mode 100644 index 0000000..f2fe897 --- /dev/null +++ b/features/dimension-reduction-quality-metrics-tool/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for the dimension reduction tool.""" diff --git a/features/dimension-reduction-quality-metrics-tool/tests/conftest.py b/features/dimension-reduction-quality-metrics-tool/tests/conftest.py new file mode 100644 index 0000000..5760cb9 --- /dev/null +++ b/features/dimension-reduction-quality-metrics-tool/tests/conftest.py @@ -0,0 +1,14 @@ +"""Configuration for pytest.""" + +import pytest + + +def pytest_addoption(parser: pytest.Parser) -> None: + """Add options to pytest.""" + parser.addoption( + "--slow", + action="store_true", + dest="slow", + default=False, + help="run slow tests", + ) diff --git a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__init__.py b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__init__.py index eb8d022..19bfd10 100644 --- a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__init__.py +++ b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__init__.py @@ -68,5 +68,6 @@ def reduce( "__version__", "SvdSolver", "Algorithm", + "Formats", "reduce", ] From e5f9945f41086152af577e8baaef618d07e679cd Mon Sep 17 00:00:00 2001 From: Najib Ishaq Date: Wed, 24 Jul 2024 13:25:39 -0400 Subject: [PATCH 11/14] chore: added bump2version --- 
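Note: the `parse`/`serialize` pair in these `.bumpversion.cfg` files round-trips the version string. A small illustrative sketch of that behavior (standalone, not part of either tool):

```python
# Illustrative only: how the parse/serialize settings in .bumpversion.cfg
# round-trip the "0.1.0-dev0" style version strings used by these tools.
import re

PARSE = re.compile(
    r"(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)"
    r"(\-(?P<release>[a-z]+)(?P<dev>\d+))?",
)

parts = PARSE.match("0.1.0-dev0").groupdict()
assert parts == {"major": "0", "minor": "1", "patch": "0", "release": "dev", "dev": "0"}

# Bumping the dev part and re-serializing with the first template,
# "{major}.{minor}.{patch}-{release}{dev}", yields the next dev release.
parts["dev"] = str(int(parts["dev"]) + 1)
print("{major}.{minor}.{patch}-{release}{dev}".format(**parts))  # 0.1.0-dev1
```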
transforms/dimension-reduction-tool/.bumpversion.cfg | 12 ++++++------
 transforms/dimension-reduction-tool/VERSION          |  2 +-
 .../transforms/dimension_reduction/__init__.py       |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/transforms/dimension-reduction-tool/.bumpversion.cfg b/transforms/dimension-reduction-tool/.bumpversion.cfg
index fa56a60..6c674a4 100644
--- a/transforms/dimension-reduction-tool/.bumpversion.cfg
+++ b/transforms/dimension-reduction-tool/.bumpversion.cfg
@@ -20,14 +20,14 @@ values =
 search = version = "{current_version}"
 replace = version = "{new_version}"
 
-[bumpversion:file:plugin.json]
-
-[bumpversion:file:VERSION]
+[bumpversion:file:src/polus/tabular/transforms/dimension_reduction/__init__.py]
 
-[bumpversion:file:README.md]
+[bumpversion:file:dimensionreduction.cwl]
 
 [bumpversion:file:ict.yaml]
 
-[bumpversion:file:dimensionreduction.cwl]
+[bumpversion:file:plugin.json]
 
-[bumpversion:file:src/polus/tabular/transforms/dimension_reduction/__init__.py]
+[bumpversion:file:README.md]
+
+[bumpversion:file:VERSION]
diff --git a/transforms/dimension-reduction-tool/VERSION b/transforms/dimension-reduction-tool/VERSION
index 6b1a238..206c085 100644
--- a/transforms/dimension-reduction-tool/VERSION
+++ b/transforms/dimension-reduction-tool/VERSION
@@ -1 +1 @@
-0.1.0-dev1
+0.1.0-dev0
diff --git a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__init__.py b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__init__.py
index 19bfd10..3ad81b9 100644
--- a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__init__.py
+++ b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__init__.py
@@ -16,7 +16,7 @@ POLUS_LOG_LVL = os.environ.get("POLUS_LOG", logging.INFO)
 POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".feather")
 
-__version__ = "0.1.0-dev1"
+__version__ = "0.1.0-dev0"
 
 
 def reduce(

From ba2d8d743416b7f30d119d0f00e8b0c57980b575 Mon Sep 17 00:00:00 2001
From: Najib Ishaq
Date: Wed, 24 Jul 2024 13:27:21 -0400
Subject: [PATCH 12/14] chore: added bump2version

---
 .../.bumpversion.cfg | 33 +++++++++++++++++++
 .../VERSION          |  2 +-
 .../__init__.py      |  2 +-
 3 files changed, 35 insertions(+), 2 deletions(-)
 create mode 100644 features/dimension-reduction-quality-metrics-tool/.bumpversion.cfg

diff --git a/features/dimension-reduction-quality-metrics-tool/.bumpversion.cfg b/features/dimension-reduction-quality-metrics-tool/.bumpversion.cfg
new file mode 100644
index 0000000..4c21105
--- /dev/null
+++ b/features/dimension-reduction-quality-metrics-tool/.bumpversion.cfg
@@ -0,0 +1,33 @@
+[bumpversion]
+current_version = 0.1.0-dev0
+commit = True
+tag = False
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
+serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:src/polus/tabular/features/dimension_reduction_quality_metrics/__init__.py] + +[bumpversion:file:dimensionreductionqualitymetrics.cwl] + +[bumpversion:file:ict.yaml] + +[bumpversion:file:plugin.json] + +[bumpversion:file:README.md] + +[bumpversion:file:VERSION] diff --git a/features/dimension-reduction-quality-metrics-tool/VERSION b/features/dimension-reduction-quality-metrics-tool/VERSION index 6b1a238..206c085 100644 --- a/features/dimension-reduction-quality-metrics-tool/VERSION +++ b/features/dimension-reduction-quality-metrics-tool/VERSION @@ -1 +1 @@ -0.1.0-dev1 +0.1.0-dev0 diff --git a/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/__init__.py b/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/__init__.py index 075edb2..92f95b4 100644 --- a/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/__init__.py +++ b/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/__init__.py @@ -12,7 +12,7 @@ POLUS_LOG_LVL = os.environ.get("POLUS_LOG", logging.INFO) POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".feather") -__version__ = "0.1.0-dev1" +__version__ = "0.1.0-dev0" def measure_quality( From 3ac3ff935bcbdc92ca279f11162b69c5d4159388 Mon Sep 17 00:00:00 2001 From: Najib Ishaq Date: Wed, 24 Jul 2024 14:27:27 -0400 Subject: [PATCH 13/14] test: more robustness of tests --- .../pyproject.toml | 1 + .../metrics/fnn.py | 13 +- .../tests/test_fnn.py | 136 ++++++++++++++++++ .../algorithms/__init__.py | 1 - 4 files changed, 145 insertions(+), 6 deletions(-) create mode 100644 features/dimension-reduction-quality-metrics-tool/tests/test_fnn.py diff --git a/features/dimension-reduction-quality-metrics-tool/pyproject.toml b/features/dimension-reduction-quality-metrics-tool/pyproject.toml index f553c20..076b9be 100644 --- a/features/dimension-reduction-quality-metrics-tool/pyproject.toml +++ b/features/dimension-reduction-quality-metrics-tool/pyproject.toml @@ -23,6 +23,7 @@ pre-commit = "^3.0.4" pytest = "^7.2.1" pytest-sugar = "^1.0.0" pytest-xdist = "^3.6.1" +scikit-learn = "^1.5.1" [build-system] requires = ["poetry-core"] diff --git a/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/metrics/fnn.py b/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/metrics/fnn.py index 45852e3..9d5ed05 100644 --- a/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/metrics/fnn.py +++ b/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/metrics/fnn.py @@ -32,14 +32,14 @@ def fnn( Returns: The FNN metric. 
""" - original_knn = knn_search( + _, original_knn = knn_search( data=original_data, queries=original_data[query_indices], k=n_neighbors, metric=distance_metric, ) - embedded_knn = knn_search( + _, embedded_knn = knn_search( data=embedded_data, queries=embedded_data[query_indices], k=n_neighbors, @@ -61,7 +61,7 @@ def knn_search( queries: numpy.ndarray, k: int, metric: str, -) -> numpy.ndarray: +) -> tuple[numpy.ndarray, numpy.ndarray]: """Find the nearest neighbors of the queries in the data. Args: @@ -71,8 +71,11 @@ def knn_search( metric: The distance metric to use. Returns: - The indices of the nearest neighbors. + The distances and indices of the nearest neighbors. """ distances = scipy.spatial.distance.cdist(queries, data, metric) sorted_indices = numpy.argsort(distances, axis=1) - return sorted_indices[:, :k] + + k_indices = sorted_indices[:, :k] + k_distances = numpy.take_along_axis(distances, k_indices, axis=1) + return k_distances, k_indices diff --git a/features/dimension-reduction-quality-metrics-tool/tests/test_fnn.py b/features/dimension-reduction-quality-metrics-tool/tests/test_fnn.py new file mode 100644 index 0000000..1474a89 --- /dev/null +++ b/features/dimension-reduction-quality-metrics-tool/tests/test_fnn.py @@ -0,0 +1,136 @@ +"""Tests for the knn-search module.""" + +import numpy +import pytest +import sklearn.datasets + +from polus.tabular.features.dimension_reduction_quality_metrics.metrics.fnn import fnn +from polus.tabular.features.dimension_reduction_quality_metrics.metrics.fnn import ( + knn_search, +) +from polus.tabular.transforms.dimension_reduction.algorithms import umap + + +def test_knn_search(): + """Tests for knn-search.""" + + data = numpy.asarray( + [[i, i, i] for i in range(10)], + dtype=numpy.float32, + ) + queries = data[:2, :] + + assert data.shape[1] == queries.shape[1] + + k = 2 + metric = "euclidean" + dists, indices = knn_search(data, queries, k, metric) + + assert dists.shape == (queries.shape[0], k) + assert indices.shape == (queries.shape[0], k) + + expected_dists = numpy.sqrt( + numpy.asarray( + [[0.0, 3.0], [0.0, 3.0]], + dtype=numpy.float32, + ) + ) + numpy.testing.assert_allclose(dists, expected_dists) + + expected_indices = numpy.asarray( + [[0, 1], [1, 0]], + dtype=numpy.int32, + ) + numpy.testing.assert_array_equal(indices, expected_indices) + + +def gen_data(metric: str) -> tuple[numpy.ndarray, numpy.ndarray]: + digits = sklearn.datasets.load_digits() + original_data: numpy.ndarray = digits.data + embedded_data = umap.reduce( + data=original_data, + n_components=3, + n_neighbors=15, + metric=metric, + ) + return original_data, embedded_data + + +@pytest.mark.parametrize("metric", ["euclidean", "cosine"]) +def test_fnn(metric: str): + """Tests for False Nearest Neighbors (FNN).""" + + original_data, embedded_data = gen_data(metric) + for num_queries in [10, 100, 200]: + rng = numpy.random.default_rng() + query_indices = rng.choice( + original_data.shape[0], + size=num_queries, + replace=False, + ) + for k in [10, 100]: + fnn_metric = fnn( + original_data=original_data, + embedded_data=embedded_data, + query_indices=query_indices, + n_neighbors=k, + distance_metric=metric, + ) + + msg = f"metric: {metric}, k: {k}, num_queries: {num_queries}" + assert 0.0 <= fnn_metric <= 1.0, f"FNN: {fnn_metric:.6f}, {msg}" + expected_fnn = expected_failure_threshold( + num_queries=num_queries, + k=k, + metric=metric, + ) + assert ( + fnn_metric >= expected_fnn + ), f"FNN: {fnn_metric:.6f} < {expected_fnn:.6f}, {msg}" + + +def 
expected_failure_threshold( + num_queries: int, + k: int, + metric: str, +) -> float: + threshold = None + + # These thresholds are based on the averages of several measurements + if metric == "euclidean": + if k == 10: + if num_queries == 10: + threshold = 0.49 + elif num_queries == 100: + threshold = 0.60 + elif num_queries == 200: + threshold = 0.59 + elif k == 100: + if num_queries == 10: + threshold = 0.58 + elif num_queries == 100: + threshold = 0.65 + elif num_queries == 200: + threshold = 0.67 + elif metric == "cosine": + if k == 10: + if num_queries == 10: + threshold = 0.44 + elif num_queries == 100: + threshold = 0.45 + elif num_queries == 200: + threshold = 0.50 + elif k == 100: + if num_queries == 10: + threshold = 0.56 + elif num_queries == 100: + threshold = 0.65 + elif num_queries == 200: + threshold = 0.65 + + if threshold is None: + threshold = 0.0 # If the parameters are not in the table, return 0.0 + else: + threshold -= 0.1 # This gives us more leeway to pass the tests + + return threshold diff --git a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/__init__.py b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/__init__.py index 62cc4b4..518e346 100644 --- a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/__init__.py +++ b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/__init__.py @@ -1,7 +1,6 @@ """Dimension Reduction algorithms supported by this tool.""" import enum -import typing from . import pca from . import tsne From f613a77ae3dacafae6fb694a268da6f44084bbaa Mon Sep 17 00:00:00 2001 From: Najib Ishaq Date: Wed, 24 Jul 2024 15:35:26 -0400 Subject: [PATCH 14/14] test: marked many tests as slow --- .../tests/test_cli.py | 34 +-- .../tests/test_fast.py | 264 ++++++++++++++++++ .../tests/test_tool.py | 103 +++++-- 3 files changed, 350 insertions(+), 51 deletions(-) create mode 100644 transforms/dimension-reduction-tool/tests/test_fast.py diff --git a/transforms/dimension-reduction-tool/tests/test_cli.py b/transforms/dimension-reduction-tool/tests/test_cli.py index f829069..476ba03 100644 --- a/transforms/dimension-reduction-tool/tests/test_cli.py +++ b/transforms/dimension-reduction-tool/tests/test_cli.py @@ -4,12 +4,11 @@ import pathlib import tempfile -from polus.tabular.transforms.dimension_reduction.algorithms import Algorithm - import numpy import pytest -import sklearn.datasets import typer.testing +import sklearn.datasets +from polus.tabular.transforms.dimension_reduction.algorithms import Algorithm from polus.tabular.transforms.dimension_reduction.data_io import Formats from polus.tabular.transforms.dimension_reduction.__main__ import app @@ -37,35 +36,6 @@ def create_data(inp_format: str) -> tuple[pathlib.Path, pathlib.Path]: return inp_dir, out_dir -@pytest.mark.parametrize("inp_format", FORMATS) -@pytest.mark.parametrize("out_format", FORMATS) -def test_data_io(inp_format: str, out_format: str) -> None: - """Test data IO.""" - - inp_dir, out_dir = create_data(inp_format) - assert inp_dir.exists() - assert out_dir.exists() - - inp_files: list[pathlib.Path] = list(inp_dir.iterdir()) - - assert len(inp_files) == 1 - assert inp_files[0].name == "digits." 
+ inp_format - - out_path = out_dir.joinpath(inp_files[0].stem + f".{out_format}") - inp_data = Formats.read(inp_dir.joinpath(inp_files[0])) - Formats.write(inp_data, out_path) - - out_files: list[pathlib.Path] = list(out_dir.iterdir()) - assert len(out_files) == 1 - assert out_files[0].name == "digits." + out_format - - out_data = Formats.read(out_path) - - assert inp_data.shape == out_data.shape - assert inp_data.dtype == out_data.dtype - numpy.testing.assert_allclose(inp_data, out_data) - - def gen_pca_args( svd_solver: list[str] = ["auto", "arpack"], tol: list[float] = [0.0, 0.1, 0.5, 1.0], diff --git a/transforms/dimension-reduction-tool/tests/test_fast.py b/transforms/dimension-reduction-tool/tests/test_fast.py new file mode 100644 index 0000000..fbf2464 --- /dev/null +++ b/transforms/dimension-reduction-tool/tests/test_fast.py @@ -0,0 +1,264 @@ +"""Fast tests for github actions.""" + +import copy +import pathlib +import tempfile + +import numpy +import pytest +import sklearn.datasets +import typer.testing +from polus.tabular.transforms.dimension_reduction.algorithms import Algorithm +from polus.tabular.transforms.dimension_reduction import algorithms +from polus.tabular.transforms.dimension_reduction.data_io import Formats +from polus.tabular.transforms.dimension_reduction.__main__ import app + + +FORMATS = ["csv", "feather", "parquet", "npy"] + + +def create_data(inp_format: str) -> tuple[pathlib.Path, pathlib.Path]: + """Generate data.""" + + data_dir = pathlib.Path(tempfile.mkdtemp(suffix="_data_dir")) + + inp_dir = data_dir.joinpath("inp_dir") + inp_dir.mkdir() + + out_dir = data_dir.joinpath("out_dir") + out_dir.mkdir() + + digits = sklearn.datasets.load_digits() + data: numpy.ndarray = digits.data + data = data.astype(numpy.float32) + Formats.write(data, inp_dir.joinpath(f"digits.{inp_format}")) + + return inp_dir, out_dir + + +@pytest.mark.parametrize("inp_format", FORMATS) +@pytest.mark.parametrize("out_format", FORMATS) +def test_data_io(inp_format: str, out_format: str) -> None: + """Test data IO.""" + + inp_dir, out_dir = create_data(inp_format) + assert inp_dir.exists() + assert out_dir.exists() + + inp_files: list[pathlib.Path] = list(inp_dir.iterdir()) + + assert len(inp_files) == 1 + assert inp_files[0].name == "digits." + inp_format + + out_path = out_dir.joinpath(inp_files[0].stem + f".{out_format}") + inp_data = Formats.read(inp_dir.joinpath(inp_files[0])) + Formats.write(inp_data, out_path) + + out_files: list[pathlib.Path] = list(out_dir.iterdir()) + assert len(out_files) == 1 + assert out_files[0].name == "digits." 
+ out_format + + out_data = Formats.read(out_path) + + assert inp_data.shape == out_data.shape + assert inp_data.dtype == out_data.dtype + numpy.testing.assert_allclose(inp_data, out_data) + + +@pytest.mark.parametrize("n_components", [2]) +@pytest.mark.parametrize("whiten", [False]) +@pytest.mark.parametrize("svd_solver", [algorithms.SvdSolver.AUTO]) +@pytest.mark.parametrize("tol", [0.0]) +def test_pca( + n_components: int, + whiten: bool, + svd_solver: algorithms.SvdSolver, + tol: float, +): + """Test the PCA algorithm.""" + + digits = sklearn.datasets.load_digits() + data: numpy.ndarray = digits.data + + assert data.shape == (1797, 64) + + reduced = algorithms.pca.reduce( + data.astype(numpy.float32), + n_components=n_components, + whiten=whiten, + svd_solver=svd_solver, + tol=tol, + ) + + assert reduced.ndim == data.ndim + assert reduced.shape[0] == data.shape[0] + assert reduced.shape[1] == n_components + assert reduced.dtype == numpy.float32 + + +@pytest.mark.parametrize("n_components", [2]) +@pytest.mark.parametrize("perplexity", [30.0]) +@pytest.mark.parametrize("early_exaggeration", [12.0]) +@pytest.mark.parametrize("learning_rate", ["auto"]) +@pytest.mark.parametrize("max_iter", [250]) +@pytest.mark.parametrize("metric", ["euclidean"]) +def test_tsne( + n_components: int, + perplexity: float, + early_exaggeration: float, + learning_rate: float, + max_iter: int, + metric: str, +): + """Test the t-SNE algorithm.""" + + digits = sklearn.datasets.load_digits() + data: numpy.ndarray = digits.data + + assert data.shape == (1797, 64) + + reduced = algorithms.tsne.reduce( + data.astype(numpy.float32), + n_components=n_components, + perplexity=perplexity, + early_exaggeration=early_exaggeration, + learning_rate=learning_rate, + max_iter=max_iter, + metric=metric, + ) + + assert reduced.ndim == data.ndim + assert reduced.shape[0] == 1797 + assert reduced.shape[1] == n_components + assert reduced.dtype == numpy.float32 + + +@pytest.mark.parametrize("pca_n_components", [10]) +@pytest.mark.parametrize("whiten", [False]) +@pytest.mark.parametrize("svd_solver", [algorithms.SvdSolver.AUTO]) +@pytest.mark.parametrize("tol", [0.0]) +@pytest.mark.parametrize("n_components", [2]) +@pytest.mark.parametrize("perplexity", [30.0]) +@pytest.mark.parametrize("early_exaggeration", [12.0]) +@pytest.mark.parametrize("learning_rate", ["auto"]) +@pytest.mark.parametrize("max_iter", [250]) +@pytest.mark.parametrize("metric", ["euclidean"]) +def test_tsne_pca( + pca_n_components: int, + whiten: bool, + svd_solver: algorithms.SvdSolver, + tol: float, + n_components: int, + perplexity: float, + early_exaggeration: float, + learning_rate: float, + max_iter: int, + metric: str, +): + """Test the t-SNE algorithm with PCA initialization.""" + + digits = sklearn.datasets.load_digits() + data: numpy.ndarray = digits.data + + assert data.shape == (1797, 64) + + reduced = algorithms.tsne.reduce_init_pca( + data.astype(numpy.float32), + pca_n_components=pca_n_components, + pca_whiten=whiten, + pca_svd_solver=svd_solver, + pca_tol=tol, + n_components=n_components, + perplexity=perplexity, + early_exaggeration=early_exaggeration, + learning_rate=learning_rate, + max_iter=max_iter, + metric=metric, + ) + + assert reduced.ndim == data.ndim + assert reduced.shape[0] == 1797 + assert reduced.shape[1] == n_components + assert reduced.dtype == numpy.float32 + + +@pytest.mark.parametrize("n_components", [2]) +@pytest.mark.parametrize("n_neighbors", [15]) +@pytest.mark.parametrize("metric", ["euclidean"]) 
+@pytest.mark.parametrize("n_epochs", [200]) +@pytest.mark.parametrize("min_dist", [0.1]) +@pytest.mark.parametrize("spread", [1.0]) +def test_umap( + n_components: int, + n_neighbors: int, + metric: str, + n_epochs: int, + min_dist: float, + spread: float, +): + """Test the UMAP algorithm.""" + + digits = sklearn.datasets.load_digits() + data: numpy.ndarray = digits.data + + assert data.shape == (1797, 64) + + reduced = algorithms.umap.reduce( + data.astype(numpy.float32), + n_components=n_components, + n_neighbors=n_neighbors, + metric=metric, + n_epochs=n_epochs, + min_dist=min_dist, + spread=spread, + ) + + assert reduced.ndim == data.ndim + assert reduced.shape[0] == 1797 + assert reduced.shape[1] == n_components + assert reduced.dtype == numpy.float32 + + +def test_cli(): + inp_dir, out_dir = create_data("csv") + + args = [ + "--inpDir", + str(inp_dir), + "--nComponents", + "3", + "--algorithm", + "umap", + "--umapNNeighbors", + "15", + "--umapNEpochs", + "200", + "--umapMinDist", + "0.1", + "--umapSpread", + "1.0", + "--umapMetric", + "euclidean", + "--outDir", + str(out_dir), + ] + + runner = typer.testing.CliRunner() + result = runner.invoke(app, args) + + assert result.exit_code == 0 + + inp_files = list(map(pathlib.Path, inp_dir.iterdir())) + out_files = list(map(pathlib.Path, out_dir.iterdir())) + + assert len(inp_files) == 1 + assert len(out_files) == 1 + + for inp_path in inp_files: + out_path = out_dir.joinpath(inp_path.stem + ".feather") + msg = f"Missing {inp_path.stem} from {inp_files} in {out_files}\n{args}" + assert out_path in out_files, msg + + data = Formats.read(out_path) + assert data.shape == (1797, 3) + assert data.dtype == numpy.float32 diff --git a/transforms/dimension-reduction-tool/tests/test_tool.py b/transforms/dimension-reduction-tool/tests/test_tool.py index 99bf3f1..7176e30 100644 --- a/transforms/dimension-reduction-tool/tests/test_tool.py +++ b/transforms/dimension-reduction-tool/tests/test_tool.py @@ -14,6 +14,7 @@ ] +@pytest.mark.skipif("not config.getoption('slow')") @pytest.mark.parametrize("n_components", [2, 10]) @pytest.mark.parametrize("whiten", [True, False]) @pytest.mark.parametrize("svd_solver", SVD_SOLVERS) @@ -25,6 +26,16 @@ def test_pca( tol: float, ): """Test the PCA algorithm.""" + if all( + ( + n_components == 2, + whiten is False, + svd_solver == algorithms.SvdSolver.AUTO, + tol == 0.0, + ) + ): + # This test has been handled in `test_fast.py` + return digits = sklearn.datasets.load_digits() data: numpy.ndarray = digits.data @@ -45,10 +56,11 @@ def test_pca( assert reduced.dtype == numpy.float32 -@pytest.mark.parametrize("n_components", [3]) -@pytest.mark.parametrize("perplexity", [5.0, 50.0]) -@pytest.mark.parametrize("early_exaggeration", [5.0, 20.0]) -@pytest.mark.parametrize("learning_rate", [200.0, "auto"]) +@pytest.mark.skipif("not config.getoption('slow')") +@pytest.mark.parametrize("n_components", [2, 3, 10]) +@pytest.mark.parametrize("perplexity", [5.0, 30.0, 50.0]) +@pytest.mark.parametrize("early_exaggeration", [5.0, 12.0, 20.0]) +@pytest.mark.parametrize("learning_rate", [50.0, 100.0, 200.0, 500.0, 1000.0, "auto"]) @pytest.mark.parametrize("max_iter", [250, 1000]) @pytest.mark.parametrize("metric", ["euclidean", "cosine"]) def test_tsne( @@ -60,6 +72,18 @@ def test_tsne( metric: str, ): """Test the t-SNE algorithm.""" + if all( + ( + n_components == 2, + perplexity == 30.0, + early_exaggeration == 12.0, + learning_rate == "auto", + max_iter == 250, + metric == "euclidean", + ) + ): + # This test has been handled in 
`test_fast.py` + return digits = sklearn.datasets.load_digits() data: numpy.ndarray = digits.data @@ -82,15 +106,43 @@ def test_tsne( assert reduced.dtype == numpy.float32 -@pytest.mark.parametrize("n_components", [2, 3]) +@pytest.mark.skipif("not config.getoption('slow')") @pytest.mark.parametrize("pca_n_components", [10, 50]) -@pytest.mark.parametrize("perplexity", [5.0, 50.0]) -def test_tsne_pca( - n_components: int, +@pytest.mark.parametrize("whiten", [False, True]) +@pytest.mark.parametrize("svd_solver", SVD_SOLVERS) +@pytest.mark.parametrize("tol", [0.0, 0.5]) +@pytest.mark.parametrize("n_components", [2, 3]) +@pytest.mark.parametrize("perplexity", [5.0, 30.0, 50.0]) +@pytest.mark.parametrize("early_exaggeration", [5.0, 12.0, 20.0]) +@pytest.mark.parametrize("learning_rate", [50.0, 100.0, 200.0, 500.0, 1000.0, "auto"]) +@pytest.mark.parametrize("max_iter", [250, 1000]) +@pytest.mark.parametrize("metric", ["euclidean", "cosine"]) +def test_tsne_init_pca( pca_n_components: int, + whiten: bool, + svd_solver: algorithms.SvdSolver, + tol: float, + n_components: int, perplexity: float, + early_exaggeration: float, + learning_rate: float, + max_iter: int, + metric: str, ): """Test the t-SNE algorithm with PCA initialization.""" + if all( + ( + pca_n_components == 10, + n_components == 2, + perplexity == 30.0, + early_exaggeration == 12.0, + learning_rate == "auto", + max_iter == 250, + metric == "euclidean", + ) + ): + # This test has been handled in `test_fast.py` + return digits = sklearn.datasets.load_digits() data: numpy.ndarray = digits.data @@ -100,15 +152,15 @@ def test_tsne_pca( reduced = algorithms.tsne.reduce_init_pca( data.astype(numpy.float32), pca_n_components=pca_n_components, - pca_whiten=False, - pca_svd_solver=algorithms.SvdSolver.AUTO, - pca_tol=0.0, + pca_whiten=whiten, + pca_svd_solver=svd_solver, + pca_tol=tol, n_components=n_components, perplexity=perplexity, - early_exaggeration=12.0, - learning_rate="auto", - max_iter=1000, - metric="euclidean", + early_exaggeration=early_exaggeration, + learning_rate=learning_rate, + max_iter=max_iter, + metric=metric, ) assert reduced.ndim == data.ndim @@ -117,11 +169,12 @@ def test_tsne_pca( assert reduced.dtype == numpy.float32 -@pytest.mark.parametrize("n_components", [3, 10]) -@pytest.mark.parametrize("n_neighbors", [10, 25]) +@pytest.mark.skipif("not config.getoption('slow')") +@pytest.mark.parametrize("n_components", [2, 3, 10]) +@pytest.mark.parametrize("n_neighbors", [5, 15, 50]) @pytest.mark.parametrize("metric", ["euclidean", "cosine"]) -@pytest.mark.parametrize("n_epochs", [None, 100]) -@pytest.mark.parametrize("min_dist", [0.05, 0.2]) +@pytest.mark.parametrize("n_epochs", [None, 200, 500]) +@pytest.mark.parametrize("min_dist", [0.05, 0.1, 0.2]) @pytest.mark.parametrize("spread", [1.0, 2.0]) def test_umap( n_components: int, @@ -132,6 +185,18 @@ def test_umap( spread: float, ): """Test the UMAP algorithm.""" + if all( + ( + n_components == 2, + n_neighbors == 15, + metric == "euclidean", + n_epochs == 200, + min_dist == 0.1, + spread == 1.0, + ) + ): + # This test has been handled in `test_fast.py` + return digits = sklearn.datasets.load_digits() data: numpy.ndarray = digits.data