diff --git a/features/dimension-reduction-quality-metrics-tool/.bumpversion.cfg b/features/dimension-reduction-quality-metrics-tool/.bumpversion.cfg
new file mode 100644
index 0000000..4c21105
--- /dev/null
+++ b/features/dimension-reduction-quality-metrics-tool/.bumpversion.cfg
@@ -0,0 +1,33 @@
+[bumpversion]
+current_version = 0.1.0-dev0
+commit = True
+tag = False
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
+serialize =
+    {major}.{minor}.{patch}-{release}{dev}
+    {major}.{minor}.{patch}
+
+[bumpversion:part:release]
+optional_value = _
+first_value = dev
+values =
+    dev
+    _
+
+[bumpversion:part:dev]
+
+[bumpversion:file:pyproject.toml]
+search = version = "{current_version}"
+replace = version = "{new_version}"
+
+[bumpversion:file:src/polus/tabular/features/dimension_reduction_quality_metrics/__init__.py]
+
+[bumpversion:file:dimensionreductionqualitymetrics.cwl]
+
+[bumpversion:file:ict.yaml]
+
+[bumpversion:file:plugin.json]
+
+[bumpversion:file:README.md]
+
+[bumpversion:file:VERSION]
diff --git a/features/dimension-reduction-quality-metrics-tool/Dockerfile b/features/dimension-reduction-quality-metrics-tool/Dockerfile
new file mode 100644
index 0000000..00331ca
--- /dev/null
+++ b/features/dimension-reduction-quality-metrics-tool/Dockerfile
@@ -0,0 +1,25 @@
+FROM polusai/bfio:2.3.6
+
+# environment variables defined in polusai/bfio
+ENV EXEC_DIR="/opt/executables"
+ENV POLUS_IMG_EXT=".ome.tif"
+ENV POLUS_TAB_EXT=".feather"
+ENV POLUS_LOG="INFO"
+
+# Work directory defined in the base container
+WORKDIR ${EXEC_DIR}
+
+# TODO: Change the tool_dir to the tool directory
+ENV TOOL_DIR="features/dimension-reduction-quality-metrics-tool"
+
+# Copy the repository into the container
+RUN mkdir tabular-tools
+COPY . ${EXEC_DIR}/tabular-tools
+
+# Install the tool
+RUN pip3 install "${EXEC_DIR}/tabular-tools/${TOOL_DIR}" --no-cache-dir
+
+# Set the entrypoint
+# TODO: Change the entrypoint to the tool entrypoint
+ENTRYPOINT ["python3", "-m", "polus.tabular.features.dimension_reduction_quality_metrics"]
+CMD ["--help"]
diff --git a/features/dimension-reduction-quality-metrics-tool/README.md b/features/dimension-reduction-quality-metrics-tool/README.md
new file mode 100644
index 0000000..8646e15
--- /dev/null
+++ b/features/dimension-reduction-quality-metrics-tool/README.md
@@ -0,0 +1,55 @@
+# Dimension Reduction Quality Metrics (v0.1.0-dev0)
+
+This tool is used to measure the quality of dimensionality reductions.
+It provides the following quality metrics:
+
+1. False Nearest Neighbors (FNN).
+
+## FNN
+
+Consider a query in the original space and some of its nearest neighbors.
+Find the nearest neighbors of the query in the reduced space.
+If the nearest neighbors in the reduced space are not the same as the nearest neighbors in the original space, then the reduced space is not a good representation of the original space.
+FNN is the mean recall of the nearest neighbors in the reduced space over a large number of queries.
+
+## Parameters
+
+This tool takes the following parameters:
+
+1. `--originalDir`: Directory containing the original data.
+2. `--originalPattern`: Pattern to parse original files.
+3. `--embeddedDir`: Directory containing the reduced data.
+4. `--embeddedPattern`: Pattern to parse reduced files.
+5. `--numQueries`: Number of queries to use.
+6. `--ks`: Comma separated list of numbers of nearest neighbors to consider.
+7. `--distanceMetrics`: Comma separated list of distance metrics to use.
+8. 
`--qualityMetrics`: Comma separated list of quality metrics to use. +9. `--outDir`: Output directory. +10. `--preview`: Generate JSON file with outputs without running the tool. + +## Docker Container + +To build the Docker image for the conversion plugin, run `./build-docker.sh`. + +## Install WIPP Plugin + +If WIPP is running, navigate to the plugins page and add a new plugin. +Paste the contents of `plugin.json` into the pop-up window and submit. +For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). + +## Options + +This plugin takes seven input arguments and one output argument: + +| Name | Description | I/O | Type | Default | +| ------------------- | --------------------------------------------------------- | ------ | ----------- | ------------------ | +| `--originalDir` | Directory containing the original data. | Input | genericData | N/A | +| `--originalPattern` | Pattern to parse original files. | Input | string | ".*" | +| `--embeddedDir` | Directory containing the reduced data. | Input | genericData | N/A | +| `--embeddedPattern` | Pattern to parse reduced files. | Input | string | ".*" | +| `--numQueries` | Number of queries to use. | Input | int | 1000 | +| `--ks` | Comma separated list of numbers of nearest neighbors. | Input | string | "10,100" | +| `--distanceMetrics` | Comma separated list of distance metrics to use. | Input | string | "euclidean,cosine" | +| `--qualityMetrics` | Comma separated list of quality metrics to use. | Input | string | "fnn" | +| `--outDir` | Output directory. | Output | genericData | N/A | +| `--preview` | Generate JSON file with outputs without running the tool. | Input | boolean | False | diff --git a/features/dimension-reduction-quality-metrics-tool/VERSION b/features/dimension-reduction-quality-metrics-tool/VERSION new file mode 100644 index 0000000..206c085 --- /dev/null +++ b/features/dimension-reduction-quality-metrics-tool/VERSION @@ -0,0 +1 @@ +0.1.0-dev0 diff --git a/features/dimension-reduction-quality-metrics-tool/build-docker.sh b/features/dimension-reduction-quality-metrics-tool/build-docker.sh new file mode 100644 index 0000000..8f573d0 --- /dev/null +++ b/features/dimension-reduction-quality-metrics-tool/build-docker.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# TODO: Change the name of the tool here +tool_dir="features" +tool_name="dimension-reduction-quality-metrics-tool" + +# The version is read from the VERSION file +version=$(", +] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = ">=3.9,<3.12" +filepattern = "^2.0.0" +typer = "^0.7.0" +numpy = "<2.0.0" +polus_tabular_transforms_dimension_reduction = { path = "../../transforms/dimension-reduction-tool", develop = true } +tqdm = "^4.66.4" +scipy = "^1.13" + +[tool.poetry.group.dev.dependencies] +bump2version = "^1.0.1" +pre-commit = "^3.0.4" +pytest = "^7.2.1" +pytest-sugar = "^1.0.0" +pytest-xdist = "^3.6.1" +scikit-learn = "^1.5.1" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.ruff] +extend = "../../ruff.toml" +ignore = [ + "PLR0913", # Too many arguments to function call +] diff --git a/features/dimension-reduction-quality-metrics-tool/run-plugin.sh b/features/dimension-reduction-quality-metrics-tool/run-plugin.sh new file mode 100644 index 0000000..627e9a2 --- /dev/null +++ b/features/dimension-reduction-quality-metrics-tool/run-plugin.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +version=$( dict[int, dict[str, dict[str, 
float]]]: + """Measure the quality of the dimension reduction using different metrics. + + Args: + original_path: The path to the original data. + embedded_path: The path to the embedded data. + num_queries: The number of queries to use. + ks: The numbers of nearest neighbors to consider. + distance_metrics: The distance metrics to use. + quality_metrics: The quality metrics to compute. + + Returns: + A dictionary containing the computed metrics. The format is: + { + k_1: { + distance_metric_1: { + quality_metric_1: value, + quality_metric_2: value, + }, + distance_metric_2: { + quality_metric_1: value, + quality_metric_2: value, + }, + }, + k_2: { + distance_metric_1: { + quality_metric_1: value, + quality_metric_2: value, + }, + distance_metric_2: { + quality_metric_1: value, + quality_metric_2: value, + }, + }, + } + """ + original_data = Formats.read(original_path) + embedded_data = Formats.read(embedded_path) + + rng = numpy.random.default_rng() + query_indices = rng.choice( + original_data.shape[0], + size=num_queries, + replace=False, + ) + + quality: dict[int, dict[str, dict[str, float]]] = {} + for k in ks: + quality[k] = {} + for distance_metric in distance_metrics: + quality[k][distance_metric] = {} + for quality_metric in quality_metrics: + metric_func = getattr(metrics, quality_metric) + quality[k][distance_metric][quality_metric] = metric_func( + original_data=original_data, + embedded_data=embedded_data, + query_indices=query_indices, + n_neighbors=k, + distance_metric=distance_metric, + ) + + return quality + + +__all__ = [ + "measure_quality", + "POLUS_LOG_LVL", + "POLUS_TAB_EXT", + "__version__", +] diff --git a/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/__main__.py b/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/__main__.py new file mode 100644 index 0000000..ffa45f2 --- /dev/null +++ b/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/__main__.py @@ -0,0 +1,145 @@ +"""CLI for the Dimension Reduction tool.""" + +import json +import logging +import pathlib + +import filepattern +import tqdm +import typer +from polus.tabular.features.dimension_reduction_quality_metrics import measure_quality +from polus.tabular.transforms.dimension_reduction import POLUS_LOG_LVL + +# Initialize the logger +logging.basicConfig( + format="%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s", + datefmt="%d-%b-%y %H:%M:%S", +) +logger = logging.getLogger("polus.tabular.transforms.dimension_reduction") +logger.setLevel(POLUS_LOG_LVL) + +app = typer.Typer() + + +@app.command() +def main( + original_dir: pathlib.Path = typer.Option( + ..., + "--originalDir", + help="Directory containing the original data", + exists=True, + file_okay=False, + dir_okay=True, + readable=True, + resolve_path=True, + ), + original_pattern: str = typer.Option( + ".*", + "--originalPattern", + help="pattern to parse tabular files for the original data", + ), + embedded_dir: pathlib.Path = typer.Option( + ..., + "--embeddedDir", + help="Directory containing the embedded data", + exists=True, + file_okay=False, + dir_okay=True, + readable=True, + resolve_path=True, + ), + embedded_pattern: str = typer.Option( + ".*", + "--embeddedPattern", + help="pattern to parse tabular files for the embedded data", + ), + num_queries: int = typer.Option( + 1000, + "--numQueries", + help="Number of queries to use for the quality 
metrics", + ), + ks: str = typer.Option( + "10,100", + "--ks", + help="Comma-separated list of numbers of nearest neighbors to consider", + ), + distance_metrics: str = typer.Option( + "euclidean,cosine", + "--distanceMetrics", + help="Comma-separated list of distance metrics to use", + ), + quality_metrics: str = typer.Option( + "fnn", + "--qualityMetrics", + help="Comma-separated list of quality metrics to compute", + ), + out_dir: pathlib.Path = typer.Option( + ..., + "--outDir", + help="Output collection", + exists=True, + file_okay=False, + dir_okay=True, + writable=True, + resolve_path=True, + ), + preview: bool = typer.Option( + False, + "--preview", + help="Output a JSON preview of outputs produced by this tool", + ), +) -> None: + """CLI for the Dimension Reduction tool.""" + logger.info(f"originalDir = {original_dir}") + logger.info(f"originalPattern = {original_pattern}") + logger.info(f"embeddedDir = {embedded_dir}") + logger.info(f"embeddedPattern = {embedded_pattern}") + logger.info(f"numQueries = {num_queries}") + logger.info(f"ks = {ks}") + logger.info(f"distanceMetrics = {distance_metrics}") + logger.info(f"qualityMetrics = {quality_metrics}") + logger.info(f"outDir = {out_dir}") + logger.info(f"preview = {preview}") + + original_fp = filepattern.FilePattern(original_dir, original_pattern) + original_files = [pathlib.Path(p) for _, [p] in original_fp()] + original_dict = {f.stem: f for f in original_files} + + embedded_fp = filepattern.FilePattern(embedded_dir, embedded_pattern) + embedded_files = [pathlib.Path(p) for _, [p] in embedded_fp()] + embedded_dict = {f.stem: f for f in embedded_files} + + data_pairs: dict[str, tuple[pathlib.Path, pathlib.Path]] = {} + for stem in original_dict: + if stem in embedded_dict: + data_pairs[stem] = (original_dict[stem], embedded_dict[stem]) + else: + logger.warning(f"No matching embedded file found for {stem}") + for stem in embedded_dict: + if stem not in original_dict: + logger.warning(f"No matching original file found for {stem}") + + if preview: + logger.info(f"Previewing {len(data_pairs)} pairs of data") + msg = "Not implemented yet" + raise NotImplementedError(msg) + + for original_path, embedded_path in tqdm.tqdm( + data_pairs.values(), + total=len(data_pairs), + ): + out_path = out_dir / f"{original_path.stem}.json" + quality_metrics = measure_quality( + original_path=original_path, + embedded_path=embedded_path, + num_queries=num_queries, + ks=ks, + distance_metrics=distance_metrics, + quality_metrics=quality_metrics, + ) + with out_path.open("w") as f: + json.dump(quality_metrics, f) + + +if __name__ == "__main__": + app() diff --git a/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/metrics/__init__.py b/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/metrics/__init__.py new file mode 100644 index 0000000..9cf49ac --- /dev/null +++ b/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/metrics/__init__.py @@ -0,0 +1,5 @@ +"""Different metrics for the quality of dimension reduction.""" + +from .fnn import fnn + +__all__ = ["fnn"] diff --git a/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/metrics/fnn.py b/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/metrics/fnn.py new file mode 100644 index 0000000..9d5ed05 
--- /dev/null +++ b/features/dimension-reduction-quality-metrics-tool/src/polus/tabular/features/dimension_reduction_quality_metrics/metrics/fnn.py @@ -0,0 +1,81 @@ +"""False Nearest Neighbors (FNN) metric. + +Consider a query in the original space and some of its nearest neighbors. For +this query, find the nearest neighbors in the embedded space. FNN is the mean +recall of the nearest neighbors in the embedded space for a large enough number +of queries. + +Intuitively, if the embedding is good, the nearest neighbors in the original +space should also be the nearest neighbors in the embedded space. +""" + +import numpy +import scipy.spatial.distance + + +def fnn( + original_data: numpy.ndarray, + embedded_data: numpy.ndarray, + query_indices: numpy.ndarray, + n_neighbors: int, + distance_metric: str, +) -> float: + """Compute the False Nearest Neighbors (FNN) metric. + + Args: + original_data: The original data. + embedded_data: The embedded data. + query_indices: The indices of the queries in the original space. + n_neighbors: The number of nearest neighbors to consider. + distance_metric: The distance metric to use. + + Returns: + The FNN metric. + """ + _, original_knn = knn_search( + data=original_data, + queries=original_data[query_indices], + k=n_neighbors, + metric=distance_metric, + ) + + _, embedded_knn = knn_search( + data=embedded_data, + queries=embedded_data[query_indices], + k=n_neighbors, + metric=distance_metric, + ) + + recalls = [] + for i, _ in enumerate(query_indices): + original_neighbors = original_knn[i] + embedded_neighbors = embedded_knn[i] + recall = len(set(original_neighbors) & set(embedded_neighbors)) / n_neighbors + recalls.append(recall) + + return numpy.mean(recalls) + + +def knn_search( + data: numpy.ndarray, + queries: numpy.ndarray, + k: int, + metric: str, +) -> tuple[numpy.ndarray, numpy.ndarray]: + """Find the nearest neighbors of the queries in the data. + + Args: + data: The data. + queries: The queries. + k: The number of nearest neighbors to find. + metric: The distance metric to use. + + Returns: + The distances and indices of the nearest neighbors. 
+ """ + distances = scipy.spatial.distance.cdist(queries, data, metric) + sorted_indices = numpy.argsort(distances, axis=1) + + k_indices = sorted_indices[:, :k] + k_distances = numpy.take_along_axis(distances, k_indices, axis=1) + return k_distances, k_indices diff --git a/features/dimension-reduction-quality-metrics-tool/tests/__init__.py b/features/dimension-reduction-quality-metrics-tool/tests/__init__.py new file mode 100644 index 0000000..f2fe897 --- /dev/null +++ b/features/dimension-reduction-quality-metrics-tool/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for the dimension reduction tool.""" diff --git a/features/dimension-reduction-quality-metrics-tool/tests/conftest.py b/features/dimension-reduction-quality-metrics-tool/tests/conftest.py new file mode 100644 index 0000000..5760cb9 --- /dev/null +++ b/features/dimension-reduction-quality-metrics-tool/tests/conftest.py @@ -0,0 +1,14 @@ +"""Configuration for pytest.""" + +import pytest + + +def pytest_addoption(parser: pytest.Parser) -> None: + """Add options to pytest.""" + parser.addoption( + "--slow", + action="store_true", + dest="slow", + default=False, + help="run slow tests", + ) diff --git a/features/dimension-reduction-quality-metrics-tool/tests/test_fnn.py b/features/dimension-reduction-quality-metrics-tool/tests/test_fnn.py new file mode 100644 index 0000000..1474a89 --- /dev/null +++ b/features/dimension-reduction-quality-metrics-tool/tests/test_fnn.py @@ -0,0 +1,136 @@ +"""Tests for the knn-search module.""" + +import numpy +import pytest +import sklearn.datasets + +from polus.tabular.features.dimension_reduction_quality_metrics.metrics.fnn import fnn +from polus.tabular.features.dimension_reduction_quality_metrics.metrics.fnn import ( + knn_search, +) +from polus.tabular.transforms.dimension_reduction.algorithms import umap + + +def test_knn_search(): + """Tests for knn-search.""" + + data = numpy.asarray( + [[i, i, i] for i in range(10)], + dtype=numpy.float32, + ) + queries = data[:2, :] + + assert data.shape[1] == queries.shape[1] + + k = 2 + metric = "euclidean" + dists, indices = knn_search(data, queries, k, metric) + + assert dists.shape == (queries.shape[0], k) + assert indices.shape == (queries.shape[0], k) + + expected_dists = numpy.sqrt( + numpy.asarray( + [[0.0, 3.0], [0.0, 3.0]], + dtype=numpy.float32, + ) + ) + numpy.testing.assert_allclose(dists, expected_dists) + + expected_indices = numpy.asarray( + [[0, 1], [1, 0]], + dtype=numpy.int32, + ) + numpy.testing.assert_array_equal(indices, expected_indices) + + +def gen_data(metric: str) -> tuple[numpy.ndarray, numpy.ndarray]: + digits = sklearn.datasets.load_digits() + original_data: numpy.ndarray = digits.data + embedded_data = umap.reduce( + data=original_data, + n_components=3, + n_neighbors=15, + metric=metric, + ) + return original_data, embedded_data + + +@pytest.mark.parametrize("metric", ["euclidean", "cosine"]) +def test_fnn(metric: str): + """Tests for False Nearest Neighbors (FNN).""" + + original_data, embedded_data = gen_data(metric) + for num_queries in [10, 100, 200]: + rng = numpy.random.default_rng() + query_indices = rng.choice( + original_data.shape[0], + size=num_queries, + replace=False, + ) + for k in [10, 100]: + fnn_metric = fnn( + original_data=original_data, + embedded_data=embedded_data, + query_indices=query_indices, + n_neighbors=k, + distance_metric=metric, + ) + + msg = f"metric: {metric}, k: {k}, num_queries: {num_queries}" + assert 0.0 <= fnn_metric <= 1.0, f"FNN: {fnn_metric:.6f}, {msg}" + expected_fnn = 
expected_failure_threshold( + num_queries=num_queries, + k=k, + metric=metric, + ) + assert ( + fnn_metric >= expected_fnn + ), f"FNN: {fnn_metric:.6f} < {expected_fnn:.6f}, {msg}" + + +def expected_failure_threshold( + num_queries: int, + k: int, + metric: str, +) -> float: + threshold = None + + # These thresholds are based on the averages of several measurements + if metric == "euclidean": + if k == 10: + if num_queries == 10: + threshold = 0.49 + elif num_queries == 100: + threshold = 0.60 + elif num_queries == 200: + threshold = 0.59 + elif k == 100: + if num_queries == 10: + threshold = 0.58 + elif num_queries == 100: + threshold = 0.65 + elif num_queries == 200: + threshold = 0.67 + elif metric == "cosine": + if k == 10: + if num_queries == 10: + threshold = 0.44 + elif num_queries == 100: + threshold = 0.45 + elif num_queries == 200: + threshold = 0.50 + elif k == 100: + if num_queries == 10: + threshold = 0.56 + elif num_queries == 100: + threshold = 0.65 + elif num_queries == 200: + threshold = 0.65 + + if threshold is None: + threshold = 0.0 # If the parameters are not in the table, return 0.0 + else: + threshold -= 0.1 # This gives us more leeway to pass the tests + + return threshold diff --git a/transforms/dimension-reduction-tool/.bumpversion.cfg b/transforms/dimension-reduction-tool/.bumpversion.cfg new file mode 100644 index 0000000..6c674a4 --- /dev/null +++ b/transforms/dimension-reduction-tool/.bumpversion.cfg @@ -0,0 +1,33 @@ +[bumpversion] +current_version = 0.1.0-dev0 +commit = True +tag = False +parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? +serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:src/polus/tabular/transforms/dimension_reduction/__init__.py] + +[bumpversion:file:dimensionreduction.cwl] + +[bumpversion:file:ict.yaml] + +[bumpversion:file:plugin.json] + +[bumpversion:file:README.md] + +[bumpversion:file:VERSION] diff --git a/transforms/dimension-reduction-tool/Dockerfile b/transforms/dimension-reduction-tool/Dockerfile new file mode 100644 index 0000000..61df5e5 --- /dev/null +++ b/transforms/dimension-reduction-tool/Dockerfile @@ -0,0 +1,25 @@ +FROM polusai/bfio:2.3.6 + +# environment variables defined in polusai/bfio +ENV EXEC_DIR="/opt/executables" +ENV POLUS_IMG_EXT=".ome.tif" +ENV POLUS_TAB_EXT=".feather" +ENV POLUS_LOG="INFO" + +# Work directory defined in the base container +WORKDIR ${EXEC_DIR} + +# TODO: Change the tool_dir to the tool directory +ENV TOOL_DIR="transforms/dimension-reduction-tool" + +# Copy the repository into the container +RUN mkdir tabular-tools +COPY . ${EXEC_DIR}/tabular-tools + +# Install the tool +RUN pip3 install "${EXEC_DIR}/tabular-tools/${TOOL_DIR}" --no-cache-dir + +# Set the entrypoint +# TODO: Change the entrypoint to the tool entrypoint +ENTRYPOINT ["python3", "-m", "polus.tabular.transforms.dimension_reduction"] +CMD ["--help"] diff --git a/transforms/dimension-reduction-tool/README.md b/transforms/dimension-reduction-tool/README.md new file mode 100644 index 0000000..85f8d15 --- /dev/null +++ b/transforms/dimension-reduction-tool/README.md @@ -0,0 +1,94 @@ +# Dimension Reduction (v0.1.0-dev0) + +This tool is used to reduce the dimensionality of the input data. +It provides the following methods for dimensionality reduction: + +1. 
Principal Component Analysis (PCA) +2. t-Distributed Stochastic Neighbor Embedding (t-SNE) +3. t-SNE with PCA initialization. +4. Uniform Manifold Approximation and Projection (UMAP) + +The input data should be in the form of a tabular file (`.csv`, `.feather`, `parquet` or `npy`). +This tool takes tabular data as input and outputs a reduced dimensionality version of the input data. +Each method has its own set of parameters that can be tuned to get the desired output. + +The CLI parameters required for all methods are: + +1. `--inpDir`: Directory containing input tabular data. +2. `--filePattern`: Pattern to parse tabular files. +3. `--algorithm`: Dimensionality reduction algorithm to use. Options are `pca`, `tsne`, `tsne_init_pca`, and `umap`. +4. `--nComponents`: Number of dimensions to reduce to. +5. `--outDir`: Output directory. + +You can also use the `--preview` flag to generate a JSON file indicating what the outputs would be without running the tool. + +For PCA, the required parameters are: + +- `--pcaWhiten`: Boolean flag to indicate whether to whiten the data. +- `--pcaSvdSolver`: Solver to use for PCA. Options are `auto`, `full`, `arpack`, and `randomized`. +- `--pcaTol`: Tolerance for PCA with the `arpack` solver. + +For more details in each parameter, see [the documentation here](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html). + +For t-SNE, the required parameters are: + +- `--tsnePerplexity`: Perplexity parameter for t-SNE. +- `--tsneEarlyExaggeration`: Early exaggeration factor for t-SNE. +- `--tsneLearningRate`: Learning rate for t-SNE. +- `--tsneMaxIter`: Maximum number of iterations for t-SNE. +- `--tsneMetric`: The distance metric to use for t-SNE. + +for more details in each parameter, see [the documentation here](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html). + +For t-SNE with PCA initialization, the required parameters are: + +- All parameters required for t-SNE. +- `--tsneInitNComponents`: Number of components to use for PCA initialization. +- All parameters required for PCA. + +For UMAP, the required parameters are: + +- `--umapNNeighbors`: Number of neighbors to use for UMAP. +- `--umapNEpochs`: Number of epochs for UMAP. +- `--umapMinDist`: Minimum distance between points in UMAP. +- `--umapSpread`: Spread of UMAP. +- `--umapMetric`: The distance metric to use for UMAP. + +For more details in each parameter, see [the documentation here](https://umap-learn.readthedocs.io/en/latest/parameters.html). + +## Docker Container + +To build the Docker image for the conversion plugin, run `./build-docker.sh`. + +## Install WIPP Plugin + +If WIPP is running, navigate to the plugins page and add a new plugin. +Paste the contents of `plugin.json` into the pop-up window and submit. +For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). + +## Options + +This plugin takes seven input arguments and one output argument: + +| Name | Description | I/O | Type | Default | +| ------------------------- | --------------------------------------------------------- | ------ | ----------- | --------- | +| `--inpDir` | Directory containing input tabular data. | Input | genericData | N/A | +| `--filePattern` | Pattern to parse tabular files. | Input | string | ".*" | +| `--preview` | Generate JSON file with outputs without running the tool. | Input | boolean | False | +| `--outDir` | Output directory. 
| Output | genericData | N/A | +| `--algorithm` | Dimensionality reduction algorithm to use. | Input | enum | umap | +| `--nComponents` | Number of dimensions to reduce to. | Input | int | | +| `--pcaWhiten` | Boolean flag to indicate whether to whiten the data. | Input | boolean | False | +| `--pcaSvdSolver` | Solver to use for PCA. | Input | enum | auto | +| `--pcaTol` | Tolerance for PCA with the `arpack` solver. | Input | float | 0.0 | +| `--tsnePerplexity` | Perplexity parameter for t-SNE. | Input | float | 30.0 | +| `--tsneEarlyExaggeration` | Early exaggeration factor for t-SNE. | Input | float | 12.0 | +| `--tsneLearningRate` | Learning rate for t-SNE. | Input | float | 200.0 | +| `--tsneMaxIter` | Maximum number of iterations for t-SNE. | Input | int | 1000 | +| `--tsneMetric` | The distance metric to use for t-SNE. | Input | string | euclidean | +| `--tsneInitNComponents` | Number of components to use for PCA initialization. | Input | int | 50 | +| `--umapNNeighbors` | Number of neighbors to use for UMAP. | Input | int | 15 | +| `--umapNEpochs` | Number of epochs for UMAP. | Input | int | 500 | +| `--umapMinDist` | Minimum distance between points in UMAP. | Input | float | 0.1 | +| `--umapSpread` | Spread of UMAP. | Input | float | 1.0 | +| `--umapMetric` | The distance metric to use for UMAP. | Input | string | euclidean | diff --git a/transforms/dimension-reduction-tool/VERSION b/transforms/dimension-reduction-tool/VERSION new file mode 100644 index 0000000..206c085 --- /dev/null +++ b/transforms/dimension-reduction-tool/VERSION @@ -0,0 +1 @@ +0.1.0-dev0 diff --git a/transforms/dimension-reduction-tool/build-docker.sh b/transforms/dimension-reduction-tool/build-docker.sh new file mode 100644 index 0000000..ede49ff --- /dev/null +++ b/transforms/dimension-reduction-tool/build-docker.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# TODO: Change the name of the tool here +tool_dir="transforms" +tool_name="dimension-reduction-tool" + +# The version is read from the VERSION file +version=$(", +] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = ">=3.9,<3.12" +filepattern = "^2.0.0" +typer = "^0.7.0" +numpy = "<2.0.0" +scikit-learn = "^1.5.1" +umap-learn = "^0.5.6" +pyarrow = ">=16.0,<17.0" +pandas = "^2.2.2" + +[tool.poetry.group.dev.dependencies] +bump2version = "^1.0.1" +pre-commit = "^3.0.4" +pytest = "^7.2.1" +pytest-sugar = "^1.0.0" +pytest-xdist = "^3.6.1" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.ruff] +extend = "../../ruff.toml" +ignore = [ + "PLR0913", # Too many arguments to function call +] diff --git a/transforms/dimension-reduction-tool/run-plugin.sh b/transforms/dimension-reduction-tool/run-plugin.sh new file mode 100644 index 0000000..7eb71e5 --- /dev/null +++ b/transforms/dimension-reduction-tool/run-plugin.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +version=$( None: + """Reduce the dimensionality of the data using the specified algorithm. + + The allowed formats for the input and output data are CSV, Parquet, Feather, + and NPY. + + The allowed algorithms are PCA, t-SNE, t-SNE with PCA initialization, and UMAP. + + Args: + inp_path: The path to the input data. + out_path: The path to write the reduced data. + algorithm: The algorithm to use for dimensionality reduction. + kwargs: Additional keyword arguments for the algorithm. 
+ """ + data = Formats.read(inp_path) + reduced_data: numpy.ndarray + + if algorithm == Algorithm.PCA: + reduced_data = pca.reduce(data, **kwargs) + elif algorithm == Algorithm.TSNE: + reduced_data = tsne.reduce(data, **kwargs) + elif algorithm == Algorithm.TSNE_INIT_PCA: + reduced_data = tsne.reduce_init_pca(data, **kwargs) + elif algorithm == Algorithm.UMAP: + reduced_data = umap.reduce(data, **kwargs) + else: + allowed_algorithms = ", ".join(Algorithm.__members__.keys()) + msg = ( + f"Unsupported algorithm: {algorithm}. Must be one of: {allowed_algorithms}" + ) + raise ValueError(msg) + + Formats.write(reduced_data, out_path) + + +__all__ = [ + "pca", + "tsne", + "umap", + "POLUS_LOG_LVL", + "POLUS_TAB_EXT", + "__version__", + "SvdSolver", + "Algorithm", + "Formats", + "reduce", +] diff --git a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__main__.py b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__main__.py new file mode 100644 index 0000000..6c7c1b4 --- /dev/null +++ b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/__main__.py @@ -0,0 +1,234 @@ +"""CLI for the Dimension Reduction tool.""" + +import json +import logging +import pathlib + +import filepattern +import tqdm +import typer +from polus.tabular.transforms.dimension_reduction import POLUS_LOG_LVL +from polus.tabular.transforms.dimension_reduction import POLUS_TAB_EXT +from polus.tabular.transforms.dimension_reduction import Algorithm +from polus.tabular.transforms.dimension_reduction import SvdSolver +from polus.tabular.transforms.dimension_reduction import reduce + +# Initialize the logger +logging.basicConfig( + format="%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s", + datefmt="%d-%b-%y %H:%M:%S", +) +logger = logging.getLogger("polus.tabular.transforms.dimension_reduction") +logger.setLevel(POLUS_LOG_LVL) + +app = typer.Typer() + + +@app.command() +def main( + inp_dir: pathlib.Path = typer.Option( + ..., + "--inpDir", + help="Input data that needs to be reduced in dimensionality.", + exists=True, + file_okay=False, + dir_okay=True, + readable=True, + resolve_path=True, + ), + file_pattern: str = typer.Option( + ".*", + "--filePattern", + help="pattern to parse tabular files", + ), + algorithm: Algorithm = typer.Option( + Algorithm.UMAP, + "--algorithm", + help="The algorithm to use for dimensionality reduction", + ), + n_components: int = typer.Option( + ..., + "--nComponents", + help="The dimensionality to reduce the data to", + ), + pca_whiten: bool = typer.Option( + False, + "--pcaWhiten", + help="PCA: Whether to whiten the data", + ), + pca_svd_solver: SvdSolver = typer.Option( + SvdSolver.AUTO, + "--pcaSvdSolver", + help="PCA: The singular value decomposition solver to use", + ), + pca_tol: float = typer.Option( + 0.0, + "--pcaTol", + help='PCA: Tolerance for singular values computed by svd_solver == "arpack"', + ), + tsne_perplexity: float = typer.Option( + 30.0, + "--tsnePerplexity", + help="t-SNE: The perplexity is related to the number of nearest neighbors " + "that is used in other manifold learning algorithms. Larger datasets " + "usually require a larger perplexity. Consider selecting a value between " + "5 and 50.", + ), + tsne_early_exaggeration: float = typer.Option( + 12.0, + "--tsneEarlyExaggeration", + help="t-SNE: Controls how tight natural clusters in the original space are in " + "the embedded space and how much space will be between them. 
For larger " + "values, the space between natural clusters will be larger in the embedded " + "space.", + ), + tsne_learning_rate: float = typer.Option( + 200.0, + "--tsneLearningRate", + help="The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If " + "the learning rate is too high, the data may look like a 'ball' with any " + "point approximately equidistant from its nearest neighbours. If the learning " + "rate is too low, most points may look compressed in a dense cloud with few " + "outliers. If the cost function gets stuck in a bad local minimum increasing " + "the learning rate may help.", + ), + tsne_max_iter: int = typer.Option( + 1000, + "--tsneMaxIter", + help="t-SNE: Maximum number of iterations for the optimization. Should be at " + "least 250.", + ), + tsne_metric: str = typer.Option( + "euclidean", + "--tsneMetric", + help="t-SNE: The metric to use when calculating distance between " + "instances in a feature array. It must be one of the options allowed by " + "scipy.spatial.distance.pdist for its metric parameter", + ), + tsne_init_n_components: int = typer.Option( + 50, + "--tsneInitNComponents", + help="t-SNE: The number of components to reduce to with PCA before running " + "t-SNE.", + ), + umap_n_neighbors: int = typer.Option( + 15, + "--umapNNeighbors", + help="UMAP: The size of local neighborhood (in terms of number of neighboring " + "sample points) used for manifold approximation. Larger values result in more " + "global views of the manifold, while smaller values result in more local data " + "being preserved. In general, values should be in the range 2 to 100.", + ), + umap_n_epochs: int = typer.Option( + None, + "--umapNEpochs", + help="UMAP: The number of training epochs to be used in optimizing the low " + "dimensional embedding. Larger values result in more accurate embeddings. If " + "None, the value will be set automatically based on the size of the input " + "dataset (200 for large datasets, 500 for small).", + ), + umap_min_dist: float = typer.Option( + 0.1, + "--umapMinDist", + help="UMAP: The effective minimum distance between embedded points. Smaller " + "values will result in a more clustered/clumped embedding where nearby points " + "on the manifold are drawn closer together, while larger values will result " + "in a more even dispersal of points. The value should be set relative to the " + "spread value, which determines the scale at which embedded points will be " + "spread out.", + ), + umap_spread: float = typer.Option( + 1.0, + "--umapSpread", + help="UMAP: The effective scale of embedded points. In combination with " + "min_dist this determines how clustered/clumped the embedded points are.", + ), + umap_metric: str = typer.Option( + "euclidean", + "--umapMetric", + help="UMAP: The metric to use when calculating distance between " + "instances in a feature array. 
It must be one of the options allowed by " + "scipy.spatial.distance.pdist for its metric parameter", + ), + out_dir: pathlib.Path = typer.Option( + ..., + "--outDir", + help="Output collection", + exists=True, + file_okay=False, + dir_okay=True, + writable=True, + resolve_path=True, + ), + preview: bool = typer.Option( + False, + "--preview", + help="Output a JSON preview of outputs produced by this tool", + ), +) -> None: + """CLI for the Dimension Reduction tool.""" + logger.info(f"inpDir = {inp_dir}") + logger.info(f"filePattern = {file_pattern}") + logger.info(f"algorithm = {algorithm.value}") + logger.info(f"nComponents = {n_components}") + logger.info(f"pcaWhiten = {pca_whiten}") + logger.info(f"pcaSvdSolver = {pca_svd_solver.value}") + logger.info(f"pcaTol = {pca_tol}") + logger.info(f"tsnePerplexity = {tsne_perplexity}") + logger.info(f"tsneEarlyExaggeration = {tsne_early_exaggeration}") + logger.info(f"tsneLearningRate = {tsne_learning_rate}") + logger.info(f"tsneMaxIter = {tsne_max_iter}") + logger.info(f"tsneMetric = {tsne_metric}") + logger.info(f"tsneInitNComponents = {tsne_init_n_components}") + logger.info(f"umapNNeighbors = {umap_n_neighbors}") + logger.info(f"umapNEpochs = {umap_n_epochs}") + logger.info(f"umapMinDist = {umap_min_dist}") + logger.info(f"umapSpread = {umap_spread}") + logger.info(f"umapMetric = {umap_metric}") + logger.info(f"outDir = {out_dir}") + logger.info(f"preview = {preview}") + + kwargs = { + "n_components": n_components, + "pca_whiten": pca_whiten, + "pca_svd_solver": pca_svd_solver, + "pca_tol": pca_tol, + "tsne_perplexity": tsne_perplexity, + "tsne_early_exaggeration": tsne_early_exaggeration, + "tsne_learning_rate": tsne_learning_rate, + "tsne_max_iter": tsne_max_iter, + "tsne_metric": tsne_metric, + "tsne_init_n_components": tsne_init_n_components, + "umap_n_neighbors": umap_n_neighbors, + "umap_n_epochs": umap_n_epochs, + "umap_min_dist": umap_min_dist, + "umap_spread": umap_spread, + "umap_metric": umap_metric, + } + kwargs = algorithm.parse_kwargs(kwargs) + + fp = filepattern.FilePattern(path=inp_dir, pattern=file_pattern) + files = [p for _, [p] in fp()] + + logger.info(f"Found {len(files)} files to process.") + + path: pathlib.Path + + if preview: + out_dict: dict[str, list[str]] = {"files": []} + for path in files: + out_dict["files"].append(str(out_dir / (path.stem + POLUS_TAB_EXT))) + with (out_dir / "preview.json").open("w") as f: + json.dump(out_dict, f, indent=2) + else: + for path in tqdm.tqdm(files): + reduce( + inp_path=path, + out_path=out_dir / (path.stem + POLUS_TAB_EXT), + algorithm=algorithm, + kwargs=kwargs, + ) + + +if __name__ == "__main__": + app() diff --git a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/__init__.py b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/__init__.py new file mode 100644 index 0000000..518e346 --- /dev/null +++ b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/__init__.py @@ -0,0 +1,103 @@ +"""Dimension Reduction algorithms supported by this tool.""" + +import enum + +from . import pca +from . import tsne +from . 
import umap
+from .pca import SvdSolver
+
+
+class Algorithm(str, enum.Enum):
+    """The dimension reduction algorithms supported by this tool."""
+
+    PCA = "pca"
+    TSNE = "tsne"
+    TSNE_INIT_PCA = "tsne_init_pca"
+    UMAP = "umap"
+
+    def parse_kwargs(self, inp_kwargs: dict) -> dict:  # noqa: PLR0915, PLR0912, C901
+        """Converts the inputs from the typer CLI to be used by the algorithms."""
+        out_kwargs = {}
+
+        if "n_components" in inp_kwargs:
+            out_kwargs["n_components"] = inp_kwargs["n_components"]
+        else:
+            msg = "n_components is a required argument."
+            raise ValueError(msg)
+
+        if self == Algorithm.PCA:
+            expected_keys = ["whiten", "svd_solver", "tol"]
+            for key in expected_keys:
+                pca_key = f"pca_{key}"
+                if pca_key in inp_kwargs:
+                    out_kwargs[key] = inp_kwargs[pca_key]
+                else:
+                    msg = f"{pca_key} is a required argument for PCA."
+                    raise ValueError(msg)
+        elif self == Algorithm.TSNE:
+            expected_keys = [
+                "perplexity",
+                "early_exaggeration",
+                "learning_rate",
+                "max_iter",
+                "metric",
+            ]
+            for key in expected_keys:
+                tsne_key = f"tsne_{key}"
+                if tsne_key in inp_kwargs:
+                    out_kwargs[key] = inp_kwargs[tsne_key]
+                else:
+                    msg = f"{tsne_key} is a required argument for t-SNE."
+                    raise ValueError(msg)
+        elif self == Algorithm.TSNE_INIT_PCA:
+            if "tsne_init_n_components" in inp_kwargs:
+                out_kwargs["pca_n_components"] = inp_kwargs["tsne_init_n_components"]
+            else:
+                msg = (
+                    "tsne_init_n_components is a required argument for t-SNE "
+                    "with PCA initialization."
+                )
+                raise ValueError(msg)
+
+            pca_keys = ["whiten", "svd_solver", "tol"]
+            for key in pca_keys:
+                pca_key = f"pca_{key}"
+                if pca_key in inp_kwargs:
+                    out_kwargs[pca_key] = inp_kwargs[pca_key]
+                else:
+                    msg = f"{pca_key} is a required argument for PCA."
+                    raise ValueError(msg)
+
+            tsne_keys = [
+                "perplexity",
+                "early_exaggeration",
+                "learning_rate",
+                "max_iter",
+                "metric",
+            ]
+            for key in tsne_keys:
+                tsne_key = f"tsne_{key}"
+                if tsne_key in inp_kwargs:
+                    out_kwargs[key] = inp_kwargs[tsne_key]
+                else:
+                    msg = f"{tsne_key} is a required argument for t-SNE."
+                    raise ValueError(msg)
+        elif self == Algorithm.UMAP:
+            expected_keys = ["n_neighbors", "n_epochs", "min_dist", "spread", "metric"]
+            for key in expected_keys:
+                umap_key = f"umap_{key}"
+                if umap_key in inp_kwargs:
+                    out_kwargs[key] = inp_kwargs[umap_key]
+                else:
+                    msg = f"{umap_key} is a required argument for UMAP."
+                    raise ValueError(msg)
+        else:
+            allowed_algorithms = ", ".join(Algorithm.__members__.keys())
+            msg = f"Unsupported algorithm: {self}. 
Must be one of: {allowed_algorithms}" + raise ValueError(msg) + + return out_kwargs + + +__all__ = ["pca", "tsne", "umap", "SvdSolver", "Algorithm"] diff --git a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/pca.py b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/pca.py new file mode 100644 index 0000000..27e3407 --- /dev/null +++ b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/pca.py @@ -0,0 +1,46 @@ +"""Dimension reduction by Principal Component Analysis (PCA).""" + +import enum + +import numpy +import sklearn.decomposition + + +class SvdSolver(str, enum.Enum): + """The singular value decomposition solver to use.""" + + AUTO = "auto" + FULL = "full" + ARPACK = "arpack" + RANDOMIZED = "randomized" + + +def reduce( + data: numpy.ndarray, + *, + n_components: int, + whiten: bool = False, + svd_solver: SvdSolver = SvdSolver.AUTO, + tol: float = 0.0, +) -> numpy.ndarray: + """Reduce the dimensionality of the data using PCA. + + Args: + data: The data to reduce. + n_components: The number of components to reduce to. + whiten: Whether to whiten the data. Defaults to False. + svd_solver: The singular value decomposition solver to use. Defaults to + "auto". + tol: Tolerance for singular values computed by svd_solver == "arpack". + Must be of range [0.0, infinity). + + Returns: + The reduced data. + """ + pca = sklearn.decomposition.PCA( + n_components=n_components, + whiten=whiten, + svd_solver=svd_solver.value, + tol=tol, + ) + return pca.fit_transform(data) diff --git a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/tsne.py b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/tsne.py new file mode 100644 index 0000000..7bce097 --- /dev/null +++ b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/tsne.py @@ -0,0 +1,104 @@ +"""Dimension reduction by t-distributed Stochastic Neighbor Embedding (t-SNE).""" + +import typing + +import numpy +import sklearn.manifold + +from . import pca + + +def reduce( + data: numpy.ndarray, + *, + n_components: int, + perplexity: float = 30.0, + early_exaggeration: float = 12.0, + learning_rate: typing.Union[float, typing.Literal["auto"]] = "auto", + max_iter: int = 1000, + metric: str = "euclidean", +) -> numpy.ndarray: + """Reduce the dimensionality of the data using t-SNE. + + Args: + data: The data to reduce. + + n_components: The number of components to reduce to. + + perplexity: The perplexity is related to the number of nearest neighbors + that is used in other manifold learning algorithms. Larger datasets + usually require a larger perplexity. The perplexity must be less + than the number of samples. + + early_exaggeration: Controls how tight natural clusters in the original + space are in the embedded space and how much space will be between them. + For larger values, the space between natural clusters will be larger in + the embedded space. + + learning_rate: The learning rate for t-SNE is usually in the range + [10.0, 1000.0]. If the learning rate is too high, the data may look like + a 'ball' with any point approximately equidistant from its nearest + neighbours. If the learning rate is too low, most points may look + compressed in a dense cloud with few outliers. If the cost function gets + stuck in a bad local minimum increasing the learning rate may help. 
+ + max_iter: Maximum number of iterations for the optimization. Should be + at least 250. + + metric: The metric to use when calculating distance between instances in + a feature array. It must be one of the options allowed by + scipy.spatial.distance.pdist for its metric parameter, or a metric + listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS. + + Returns: + The reduced data. + """ + tsne = sklearn.manifold.TSNE( + n_components=n_components, + perplexity=perplexity, + early_exaggeration=early_exaggeration, + learning_rate=learning_rate, + max_iter=max_iter, + metric=metric, + ) + return tsne.fit_transform(data) + + +def reduce_init_pca( + data: numpy.ndarray, + *, + pca_n_components: int, + pca_whiten: bool = False, + pca_svd_solver: pca.SvdSolver = pca.SvdSolver.AUTO, + pca_tol: float = 0.0, + n_components: int, + perplexity: float = 30.0, + early_exaggeration: float = 12.0, + learning_rate: typing.Union[float, typing.Literal["auto"]] = "auto", + max_iter: int = 1000, + metric: str = "euclidean", +) -> numpy.ndarray: + """Reduce the dimensionality of the data using PCA followed by t-SNE. + + This is useful when the data has a high number of dimensions and t-SNE + would be too slow to run directly. + + For the parameter documentation, see the `pca.reduce` and `tsne.reduce` + functions. + """ + pca_data = pca.reduce( + data, + n_components=pca_n_components, + whiten=pca_whiten, + svd_solver=pca_svd_solver, + tol=pca_tol, + ) + return reduce( + pca_data, + n_components=n_components, + perplexity=perplexity, + early_exaggeration=early_exaggeration, + learning_rate=learning_rate, + max_iter=max_iter, + metric=metric, + ) diff --git a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/umap.py b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/umap.py new file mode 100644 index 0000000..b05a69c --- /dev/null +++ b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/algorithms/umap.py @@ -0,0 +1,63 @@ +"""Dimension reduction by Uniform Manifold Approximation and Projection (UMAP).""" + +import typing + +import numpy +import umap + + +def reduce( + data: numpy.ndarray, + *, + n_components: int, + n_neighbors: int = 15, + metric: str = "euclidean", + n_epochs: typing.Optional[int] = None, + min_dist: float = 0.1, + spread: float = 1.0, +) -> numpy.ndarray: + """Reduce the dimensionality of the data using UMAP. + + Args: + data: The data to reduce. + + n_components: The number of components to reduce to. + + n_neighbors: The size of local neighborhood (in terms of number of + neighboring sample points) used for manifold approximation. Larger + values result in more global views of the manifold, while smaller + values result in more local data being preserved. In general, values + should be in the range 2 to 100. + + metric: The metric to use when calculating distance between instances in + the high dimensional space. It must be one of the options allowed by + scipy.spatial.distance.pdist for its metric parameter. + + n_epochs: The number of training epochs to be used in optimizing the + low dimensional embedding. Larger values result in more accurate + embeddings. If None, the value will be set automatically based on the + size of the input dataset (200 for large datasets, 500 for small). + + min_dist: The effective minimum distance between embedded points. 
+ Smaller values will result in a more clustered/clumped embedding where + nearby points on the manifold are drawn closer together, while larger + values will result in a more even dispersal of points. The value should + be set relative to the spread value, which determines the scale at + which embedded points will be spread out. + + spread: The effective scale of embedded points. In combination with + ``min_dist`` this determines how clustered/clumped the embedded points + are. + + Returns: + The reduced data. + """ + reducer = umap.UMAP( + n_components=n_components, + n_neighbors=n_neighbors, + metric=metric, + n_epochs=n_epochs, + min_dist=min_dist, + spread=spread, + ) + return reducer.fit_transform(data) diff --git a/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/data_io.py b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/data_io.py new file mode 100644 index 0000000..e57f6f3 --- /dev/null +++ b/transforms/dimension-reduction-tool/src/polus/tabular/transforms/dimension_reduction/data_io.py @@ -0,0 +1,62 @@ +"""Helpers for reading and writing data.""" + +import enum +import pathlib + +import numpy +import pandas + + +class Formats(str, enum.Enum): + """The data formats supported by this tool.""" + + CSV = "csv" + PARQUET = "parquet" + FEATHER = "feather" + NPY = "npy" + + @staticmethod + def read(path: pathlib.Path) -> numpy.ndarray: + """Read the data from the specified path.""" + # Read the extension of the file + ext = path.suffix + + data: numpy.ndarray + if ext == ".csv": + df = pandas.read_csv(path) + data = df.to_numpy(dtype=numpy.float32) + elif ext == ".parquet": + data = pandas.read_parquet(path).to_numpy(dtype=numpy.float32) + elif ext == ".feather": + data = pandas.read_feather(path).to_numpy(dtype=numpy.float32) + elif ext == ".npy": + data = numpy.load(path) + data = data.astype(numpy.float32) + else: + allowed_formats = ", ".join(Formats.__members__.keys()) + msg = f"Unsupported file format: {ext}. Must be one of: {allowed_formats}" + raise ValueError(msg) + + return data + + @staticmethod + def write(data: numpy.ndarray, path: pathlib.Path) -> None: + """Write the data to the specified path.""" + # Write the extension of the file + ext = path.suffix + + if ext == ".csv": + pandas.DataFrame(data).to_csv(path, index=False) + elif ext == ".parquet": + pandas.DataFrame(data).to_parquet(path, index=False) + elif ext == ".feather": + pandas.DataFrame(data).to_feather(path) + elif ext == ".npy": + numpy.save(path, data) + else: + allowed_formats = ", ".join(Formats.__members__.keys()) + msg = f"Unsupported file format: {ext}. 
Must be one of: {allowed_formats}" + raise ValueError(msg) + + +__all__ = ["Formats"] diff --git a/transforms/dimension-reduction-tool/tests/__init__.py b/transforms/dimension-reduction-tool/tests/__init__.py new file mode 100644 index 0000000..f2fe897 --- /dev/null +++ b/transforms/dimension-reduction-tool/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for the dimension reduction tool.""" diff --git a/transforms/dimension-reduction-tool/tests/conftest.py b/transforms/dimension-reduction-tool/tests/conftest.py new file mode 100644 index 0000000..5760cb9 --- /dev/null +++ b/transforms/dimension-reduction-tool/tests/conftest.py @@ -0,0 +1,14 @@ +"""Configuration for pytest.""" + +import pytest + + +def pytest_addoption(parser: pytest.Parser) -> None: + """Add options to pytest.""" + parser.addoption( + "--slow", + action="store_true", + dest="slow", + default=False, + help="run slow tests", + ) diff --git a/transforms/dimension-reduction-tool/tests/test_cli.py b/transforms/dimension-reduction-tool/tests/test_cli.py new file mode 100644 index 0000000..476ba03 --- /dev/null +++ b/transforms/dimension-reduction-tool/tests/test_cli.py @@ -0,0 +1,198 @@ +"""Tests for the CLI.""" + +import copy +import pathlib +import tempfile + +import numpy +import pytest +import typer.testing +import sklearn.datasets +from polus.tabular.transforms.dimension_reduction.algorithms import Algorithm +from polus.tabular.transforms.dimension_reduction.data_io import Formats +from polus.tabular.transforms.dimension_reduction.__main__ import app + + +ALGORITHMS = [Algorithm.TSNE_INIT_PCA, Algorithm.UMAP] +FORMATS = ["csv", "feather"] + + +def create_data(inp_format: str) -> tuple[pathlib.Path, pathlib.Path]: + """Generate data.""" + + data_dir = pathlib.Path(tempfile.mkdtemp(suffix="_data_dir")) + + inp_dir = data_dir.joinpath("inp_dir") + inp_dir.mkdir() + + out_dir = data_dir.joinpath("out_dir") + out_dir.mkdir() + + digits = sklearn.datasets.load_digits() + data: numpy.ndarray = digits.data + data = data.astype(numpy.float32) + Formats.write(data, inp_dir.joinpath(f"digits.{inp_format}")) + + return inp_dir, out_dir + + +def gen_pca_args( + svd_solver: list[str] = ["auto", "arpack"], + tol: list[float] = [0.0, 0.1, 0.5, 1.0], +) -> list[dict]: + """Generate arguments for the PCA algorithm.""" + all_kwargs = [] + for s in svd_solver: + if s == "arpack": + for t in tol: + all_kwargs.append( + { + "pcaSvdSolver": s, + "pcaTol": t, + } + ) + else: + all_kwargs.append( + { + "pcaSvdSolver": s, + "pcaTol": 0.0, + } + ) + return all_kwargs + + +def gen_tsne_args( + perplexity: list[float] = [5.0, 50.0], + early_exaggeration: list[float] = [4.0, 24.0], + learning_rate: list[float] = [100.0, 200.0], + max_iter: list[int] = [250, 1000], + metric: list[str] = ["euclidean", "cosine"], +) -> list[dict]: + """Generate arguments for the t-SNE algorithm.""" + all_kwargs = [] + for p in perplexity: + for e in early_exaggeration: + for l in learning_rate: + for m in max_iter: + for me in metric: + all_kwargs.append( + { + "tsnePerplexity": p, + "tsneEarlyExaggeration": e, + "tsneLearningRate": l, + "tsneMaxIter": m, + "tsneMetric": me, + } + ) + return all_kwargs + + +def gen_tsne_pca_args( + perplexity: list[float] = [5.0, 50.0], + early_exaggeration: list[float] = [4.0, 24.0], + learning_rate: list[float] = [100.0, 200.0], + max_iter: list[int] = [250, 1000], + metric: list[str] = ["euclidean", "cosine"], + init_n_components: list[int] = [10, 50], +) -> list[dict]: + """Generate arguments for the t-SNE algorithm with PCA 
initialization.""" + tsne_kwargs = gen_tsne_args( + perplexity, early_exaggeration, learning_rate, max_iter, metric + ) + all_kwargs = [] + for inp_kwargs in tsne_kwargs: + for n in init_n_components: + kwargs = copy.deepcopy(inp_kwargs) + kwargs["tsneInitNComponents"] = n + all_kwargs.append(kwargs) + return all_kwargs + + +def gen_umap_args( + n_neighbors: list[int] = [5, 15, 50], + n_epochs: list[int] = [200, 500], + min_dist: list[float] = [0.1, 0.5], + spread: list[float] = [1.0, 2.0], + metric: list[str] = ["euclidean", "cosine"], +) -> list[dict]: + """Generate arguments for the UMAP algorithm.""" + all_kwargs = [] + for n in n_neighbors: + for e in n_epochs: + for m in min_dist: + for s in spread: + for me in metric: + all_kwargs.append( + { + "umapNNeighbors": n, + "umapNEpochs": e, + "umapMinDist": m, + "umapSpread": s, + "umapMetric": me, + } + ) + return all_kwargs + + +@pytest.mark.parametrize("inp_format", FORMATS) +@pytest.mark.parametrize("algorithm", ALGORITHMS) +@pytest.mark.parametrize("n_components", [3]) +def test_cli( + inp_format: str, + algorithm: Algorithm, + n_components: int, +) -> None: + """Test the CLI.""" + + inp_dir, out_dir = create_data(inp_format) + + base_kwargs = { + "inpDir": str(inp_dir), + "outDir": str(out_dir), + "nComponents": str(n_components), + "algorithm": algorithm.value, + } + all_kwargs: list[dict] = [] + if algorithm == Algorithm.PCA: + all_kwargs = gen_pca_args() + elif algorithm == Algorithm.TSNE: + all_kwargs = gen_tsne_args() + elif algorithm == Algorithm.TSNE_INIT_PCA: + all_kwargs = gen_tsne_pca_args() + elif algorithm == Algorithm.UMAP: + all_kwargs = gen_umap_args() + else: + raise ValueError(f"Unknown algorithm {algorithm}") + + for inp_kwargs in all_kwargs: + kwargs = copy.deepcopy(base_kwargs) + kwargs.update(inp_kwargs) + + args = [] + for k, v in kwargs.items(): + args.extend(["--" + k, str(v)]) + + runner = typer.testing.CliRunner() + result = runner.invoke(app, args) + + assert result.exit_code == 0, f"CLI failed with {result.stdout}\n{args}" + + inp_dir = pathlib.Path(kwargs["inpDir"]) + out_dir = pathlib.Path(kwargs["outDir"]) + inp_files: list[pathlib.Path] = [p for p in inp_dir.iterdir()] + out_files: list[pathlib.Path] = [p for p in out_dir.iterdir()] + + assert len(inp_files) == 1 + assert len(out_files) == 1 + + for inp_path in inp_files: + out_path = out_dir.joinpath(inp_path.stem + ".feather") + msg = f"Missing {inp_path.stem} from {inp_files} in {out_files}\n{args}" + assert out_path in out_files, msg + + data = Formats.read(out_path) + assert data.shape == (1797, n_components) + assert data.dtype == numpy.float32 + + for out_path in out_files: + out_path.unlink() diff --git a/transforms/dimension-reduction-tool/tests/test_fast.py b/transforms/dimension-reduction-tool/tests/test_fast.py new file mode 100644 index 0000000..fbf2464 --- /dev/null +++ b/transforms/dimension-reduction-tool/tests/test_fast.py @@ -0,0 +1,264 @@ +"""Fast tests for github actions.""" + +import copy +import pathlib +import tempfile + +import numpy +import pytest +import sklearn.datasets +import typer.testing +from polus.tabular.transforms.dimension_reduction.algorithms import Algorithm +from polus.tabular.transforms.dimension_reduction import algorithms +from polus.tabular.transforms.dimension_reduction.data_io import Formats +from polus.tabular.transforms.dimension_reduction.__main__ import app + + +FORMATS = ["csv", "feather", "parquet", "npy"] + + +def create_data(inp_format: str) -> tuple[pathlib.Path, pathlib.Path]: + """Generate 
data.""" + + data_dir = pathlib.Path(tempfile.mkdtemp(suffix="_data_dir")) + + inp_dir = data_dir.joinpath("inp_dir") + inp_dir.mkdir() + + out_dir = data_dir.joinpath("out_dir") + out_dir.mkdir() + + digits = sklearn.datasets.load_digits() + data: numpy.ndarray = digits.data + data = data.astype(numpy.float32) + Formats.write(data, inp_dir.joinpath(f"digits.{inp_format}")) + + return inp_dir, out_dir + + +@pytest.mark.parametrize("inp_format", FORMATS) +@pytest.mark.parametrize("out_format", FORMATS) +def test_data_io(inp_format: str, out_format: str) -> None: + """Test data IO.""" + + inp_dir, out_dir = create_data(inp_format) + assert inp_dir.exists() + assert out_dir.exists() + + inp_files: list[pathlib.Path] = list(inp_dir.iterdir()) + + assert len(inp_files) == 1 + assert inp_files[0].name == "digits." + inp_format + + out_path = out_dir.joinpath(inp_files[0].stem + f".{out_format}") + inp_data = Formats.read(inp_dir.joinpath(inp_files[0])) + Formats.write(inp_data, out_path) + + out_files: list[pathlib.Path] = list(out_dir.iterdir()) + assert len(out_files) == 1 + assert out_files[0].name == "digits." + out_format + + out_data = Formats.read(out_path) + + assert inp_data.shape == out_data.shape + assert inp_data.dtype == out_data.dtype + numpy.testing.assert_allclose(inp_data, out_data) + + +@pytest.mark.parametrize("n_components", [2]) +@pytest.mark.parametrize("whiten", [False]) +@pytest.mark.parametrize("svd_solver", [algorithms.SvdSolver.AUTO]) +@pytest.mark.parametrize("tol", [0.0]) +def test_pca( + n_components: int, + whiten: bool, + svd_solver: algorithms.SvdSolver, + tol: float, +): + """Test the PCA algorithm.""" + + digits = sklearn.datasets.load_digits() + data: numpy.ndarray = digits.data + + assert data.shape == (1797, 64) + + reduced = algorithms.pca.reduce( + data.astype(numpy.float32), + n_components=n_components, + whiten=whiten, + svd_solver=svd_solver, + tol=tol, + ) + + assert reduced.ndim == data.ndim + assert reduced.shape[0] == data.shape[0] + assert reduced.shape[1] == n_components + assert reduced.dtype == numpy.float32 + + +@pytest.mark.parametrize("n_components", [2]) +@pytest.mark.parametrize("perplexity", [30.0]) +@pytest.mark.parametrize("early_exaggeration", [12.0]) +@pytest.mark.parametrize("learning_rate", ["auto"]) +@pytest.mark.parametrize("max_iter", [250]) +@pytest.mark.parametrize("metric", ["euclidean"]) +def test_tsne( + n_components: int, + perplexity: float, + early_exaggeration: float, + learning_rate: float, + max_iter: int, + metric: str, +): + """Test the t-SNE algorithm.""" + + digits = sklearn.datasets.load_digits() + data: numpy.ndarray = digits.data + + assert data.shape == (1797, 64) + + reduced = algorithms.tsne.reduce( + data.astype(numpy.float32), + n_components=n_components, + perplexity=perplexity, + early_exaggeration=early_exaggeration, + learning_rate=learning_rate, + max_iter=max_iter, + metric=metric, + ) + + assert reduced.ndim == data.ndim + assert reduced.shape[0] == 1797 + assert reduced.shape[1] == n_components + assert reduced.dtype == numpy.float32 + + +@pytest.mark.parametrize("pca_n_components", [10]) +@pytest.mark.parametrize("whiten", [False]) +@pytest.mark.parametrize("svd_solver", [algorithms.SvdSolver.AUTO]) +@pytest.mark.parametrize("tol", [0.0]) +@pytest.mark.parametrize("n_components", [2]) +@pytest.mark.parametrize("perplexity", [30.0]) +@pytest.mark.parametrize("early_exaggeration", [12.0]) +@pytest.mark.parametrize("learning_rate", ["auto"]) +@pytest.mark.parametrize("max_iter", [250]) 
+@pytest.mark.parametrize("metric", ["euclidean"]) +def test_tsne_pca( + pca_n_components: int, + whiten: bool, + svd_solver: algorithms.SvdSolver, + tol: float, + n_components: int, + perplexity: float, + early_exaggeration: float, + learning_rate: float, + max_iter: int, + metric: str, +): + """Test the t-SNE algorithm with PCA initialization.""" + + digits = sklearn.datasets.load_digits() + data: numpy.ndarray = digits.data + + assert data.shape == (1797, 64) + + reduced = algorithms.tsne.reduce_init_pca( + data.astype(numpy.float32), + pca_n_components=pca_n_components, + pca_whiten=whiten, + pca_svd_solver=svd_solver, + pca_tol=tol, + n_components=n_components, + perplexity=perplexity, + early_exaggeration=early_exaggeration, + learning_rate=learning_rate, + max_iter=max_iter, + metric=metric, + ) + + assert reduced.ndim == data.ndim + assert reduced.shape[0] == 1797 + assert reduced.shape[1] == n_components + assert reduced.dtype == numpy.float32 + + +@pytest.mark.parametrize("n_components", [2]) +@pytest.mark.parametrize("n_neighbors", [15]) +@pytest.mark.parametrize("metric", ["euclidean"]) +@pytest.mark.parametrize("n_epochs", [200]) +@pytest.mark.parametrize("min_dist", [0.1]) +@pytest.mark.parametrize("spread", [1.0]) +def test_umap( + n_components: int, + n_neighbors: int, + metric: str, + n_epochs: int, + min_dist: float, + spread: float, +): + """Test the UMAP algorithm.""" + + digits = sklearn.datasets.load_digits() + data: numpy.ndarray = digits.data + + assert data.shape == (1797, 64) + + reduced = algorithms.umap.reduce( + data.astype(numpy.float32), + n_components=n_components, + n_neighbors=n_neighbors, + metric=metric, + n_epochs=n_epochs, + min_dist=min_dist, + spread=spread, + ) + + assert reduced.ndim == data.ndim + assert reduced.shape[0] == 1797 + assert reduced.shape[1] == n_components + assert reduced.dtype == numpy.float32 + + +def test_cli(): + inp_dir, out_dir = create_data("csv") + + args = [ + "--inpDir", + str(inp_dir), + "--nComponents", + "3", + "--algorithm", + "umap", + "--umapNNeighbors", + "15", + "--umapNEpochs", + "200", + "--umapMinDist", + "0.1", + "--umapSpread", + "1.0", + "--umapMetric", + "euclidean", + "--outDir", + str(out_dir), + ] + + runner = typer.testing.CliRunner() + result = runner.invoke(app, args) + + assert result.exit_code == 0 + + inp_files = list(map(pathlib.Path, inp_dir.iterdir())) + out_files = list(map(pathlib.Path, out_dir.iterdir())) + + assert len(inp_files) == 1 + assert len(out_files) == 1 + + for inp_path in inp_files: + out_path = out_dir.joinpath(inp_path.stem + ".feather") + msg = f"Missing {inp_path.stem} from {inp_files} in {out_files}\n{args}" + assert out_path in out_files, msg + + data = Formats.read(out_path) + assert data.shape == (1797, 3) + assert data.dtype == numpy.float32 diff --git a/transforms/dimension-reduction-tool/tests/test_tool.py b/transforms/dimension-reduction-tool/tests/test_tool.py new file mode 100644 index 0000000..7176e30 --- /dev/null +++ b/transforms/dimension-reduction-tool/tests/test_tool.py @@ -0,0 +1,219 @@ +"""Tests for the tools.""" + +import pytest +import numpy +import sklearn.datasets + +from polus.tabular.transforms.dimension_reduction import algorithms + +SVD_SOLVERS = [ + algorithms.SvdSolver.AUTO, + algorithms.SvdSolver.FULL, + algorithms.SvdSolver.ARPACK, + algorithms.SvdSolver.RANDOMIZED, +] + + +@pytest.mark.skipif("not config.getoption('slow')") +@pytest.mark.parametrize("n_components", [2, 10]) +@pytest.mark.parametrize("whiten", [True, False]) 
+@pytest.mark.parametrize("svd_solver", SVD_SOLVERS) +@pytest.mark.parametrize("tol", [0.0, 0.5]) +def test_pca( + n_components: int, + whiten: bool, + svd_solver: algorithms.SvdSolver, + tol: float, +): + """Test the PCA algorithm.""" + if all( + ( + n_components == 2, + whiten is False, + svd_solver == algorithms.SvdSolver.AUTO, + tol == 0.0, + ) + ): + # This test has been handled in `test_fast.py` + return + + digits = sklearn.datasets.load_digits() + data: numpy.ndarray = digits.data + + assert data.shape == (1797, 64) + + reduced = algorithms.pca.reduce( + data.astype(numpy.float32), + n_components=n_components, + whiten=whiten, + svd_solver=svd_solver, + tol=tol, + ) + + assert reduced.ndim == data.ndim + assert reduced.shape[0] == data.shape[0] + assert reduced.shape[1] == n_components + assert reduced.dtype == numpy.float32 + + +@pytest.mark.skipif("not config.getoption('slow')") +@pytest.mark.parametrize("n_components", [2, 3, 10]) +@pytest.mark.parametrize("perplexity", [5.0, 30.0, 50.0]) +@pytest.mark.parametrize("early_exaggeration", [5.0, 12.0, 20.0]) +@pytest.mark.parametrize("learning_rate", [50.0, 100.0, 200.0, 500.0, 1000.0, "auto"]) +@pytest.mark.parametrize("max_iter", [250, 1000]) +@pytest.mark.parametrize("metric", ["euclidean", "cosine"]) +def test_tsne( + n_components: int, + perplexity: float, + early_exaggeration: float, + learning_rate: float, + max_iter: int, + metric: str, +): + """Test the t-SNE algorithm.""" + if all( + ( + n_components == 2, + perplexity == 30.0, + early_exaggeration == 12.0, + learning_rate == "auto", + max_iter == 250, + metric == "euclidean", + ) + ): + # This test has been handled in `test_fast.py` + return + + digits = sklearn.datasets.load_digits() + data: numpy.ndarray = digits.data + + assert data.shape == (1797, 64) + + reduced = algorithms.tsne.reduce( + data.astype(numpy.float32), + n_components=n_components, + perplexity=perplexity, + early_exaggeration=early_exaggeration, + learning_rate=learning_rate, + max_iter=max_iter, + metric=metric, + ) + + assert reduced.ndim == data.ndim + assert reduced.shape[0] == 1797 + assert reduced.shape[1] == n_components + assert reduced.dtype == numpy.float32 + + +@pytest.mark.skipif("not config.getoption('slow')") +@pytest.mark.parametrize("pca_n_components", [10, 50]) +@pytest.mark.parametrize("whiten", [False, True]) +@pytest.mark.parametrize("svd_solver", SVD_SOLVERS) +@pytest.mark.parametrize("tol", [0.0, 0.5]) +@pytest.mark.parametrize("n_components", [2, 3]) +@pytest.mark.parametrize("perplexity", [5.0, 30.0, 50.0]) +@pytest.mark.parametrize("early_exaggeration", [5.0, 12.0, 20.0]) +@pytest.mark.parametrize("learning_rate", [50.0, 100.0, 200.0, 500.0, 1000.0, "auto"]) +@pytest.mark.parametrize("max_iter", [250, 1000]) +@pytest.mark.parametrize("metric", ["euclidean", "cosine"]) +def test_tsne_init_pca( + pca_n_components: int, + whiten: bool, + svd_solver: algorithms.SvdSolver, + tol: float, + n_components: int, + perplexity: float, + early_exaggeration: float, + learning_rate: float, + max_iter: int, + metric: str, +): + """Test the t-SNE algorithm with PCA initialization.""" + if all( + ( + pca_n_components == 10, + n_components == 2, + perplexity == 30.0, + early_exaggeration == 12.0, + learning_rate == "auto", + max_iter == 250, + metric == "euclidean", + ) + ): + # This test has been handled in `test_fast.py` + return + + digits = sklearn.datasets.load_digits() + data: numpy.ndarray = digits.data + + assert data.shape == (1797, 64) + + reduced = algorithms.tsne.reduce_init_pca( + 
data.astype(numpy.float32), + pca_n_components=pca_n_components, + pca_whiten=whiten, + pca_svd_solver=svd_solver, + pca_tol=tol, + n_components=n_components, + perplexity=perplexity, + early_exaggeration=early_exaggeration, + learning_rate=learning_rate, + max_iter=max_iter, + metric=metric, + ) + + assert reduced.ndim == data.ndim + assert reduced.shape[0] == 1797 + assert reduced.shape[1] == n_components + assert reduced.dtype == numpy.float32 + + +@pytest.mark.skipif("not config.getoption('slow')") +@pytest.mark.parametrize("n_components", [2, 3, 10]) +@pytest.mark.parametrize("n_neighbors", [5, 15, 50]) +@pytest.mark.parametrize("metric", ["euclidean", "cosine"]) +@pytest.mark.parametrize("n_epochs", [None, 200, 500]) +@pytest.mark.parametrize("min_dist", [0.05, 0.1, 0.2]) +@pytest.mark.parametrize("spread", [1.0, 2.0]) +def test_umap( + n_components: int, + n_neighbors: int, + metric: str, + n_epochs: int, + min_dist: float, + spread: float, +): + """Test the UMAP algorithm.""" + if all( + ( + n_components == 2, + n_neighbors == 15, + metric == "euclidean", + n_epochs == 200, + min_dist == 0.1, + spread == 1.0, + ) + ): + # This test has been handled in `test_fast.py` + return + + digits = sklearn.datasets.load_digits() + data: numpy.ndarray = digits.data + + assert data.shape == (1797, 64) + + reduced = algorithms.umap.reduce( + data.astype(numpy.float32), + n_components=n_components, + n_neighbors=n_neighbors, + metric=metric, + n_epochs=n_epochs, + min_dist=min_dist, + spread=spread, + ) + + assert reduced.ndim == data.ndim + assert reduced.shape[0] == 1797 + assert reduced.shape[1] == n_components + assert reduced.dtype == numpy.float32