Commit 55dc9e3

test: more robustness of tests
nishaq503 committed Jul 24, 2024
1 parent 4306884 commit 55dc9e3
Showing 4 changed files with 145 additions and 6 deletions.
@@ -23,6 +23,7 @@ pre-commit = "^3.0.4"
pytest = "^7.2.1"
pytest-sugar = "^1.0.0"
pytest-xdist = "^3.6.1"
+scikit-learn = "^1.5.1"

[build-system]
requires = ["poetry-core"]
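scikit-learn is added alongside the existing test tooling; presumably it provides sklearn.datasets.load_digits, which the new test module below uses as the high-dimensional input for the FNN checks.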
@@ -32,14 +32,14 @@ def fnn(
    Returns:
        The FNN metric.
    """
-    original_knn = knn_search(
+    _, original_knn = knn_search(
        data=original_data,
        queries=original_data[query_indices],
        k=n_neighbors,
        metric=distance_metric,
    )

-    embedded_knn = knn_search(
+    _, embedded_knn = knn_search(
        data=embedded_data,
        queries=embedded_data[query_indices],
        k=n_neighbors,
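Both call sites in fnn now unpack the new two-value return from knn_search, discarding the distances with _ and keeping only the neighbor indices, which is all the FNN comparison needs.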
@@ -61,7 +61,7 @@ def knn_search(
    queries: numpy.ndarray,
    k: int,
    metric: str,
-) -> numpy.ndarray:
+) -> tuple[numpy.ndarray, numpy.ndarray]:
    """Find the nearest neighbors of the queries in the data.

    Args:
@@ -71,8 +71,11 @@ def knn_search(
        metric: The distance metric to use.

    Returns:
-        The indices of the nearest neighbors.
+        The distances and indices of the nearest neighbors.
    """
    distances = scipy.spatial.distance.cdist(queries, data, metric)
    sorted_indices = numpy.argsort(distances, axis=1)
-    return sorted_indices[:, :k]
+
+    k_indices = sorted_indices[:, :k]
+    k_distances = numpy.take_along_axis(distances, k_indices, axis=1)
+    return k_distances, k_indices
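The changed lines above show where fnn obtains the two sets of neighbor indices, but not how it compares them. Purely as an illustration, the sketch below assumes an FNN-style score defined as the mean fraction of each query's original-space neighbors that survive in the embedding; the helper name fnn_overlap and that definition are assumptions, not the tool's actual implementation.

import numpy


def fnn_overlap(original_knn: numpy.ndarray, embedded_knn: numpy.ndarray) -> float:
    # Both inputs have shape (num_queries, k), as returned by the updated knn_search.
    overlaps = [
        len(set(original_row) & set(embedded_row)) / len(original_row)
        for original_row, embedded_row in zip(original_knn, embedded_knn)
    ]
    return float(numpy.mean(overlaps))

A score defined this way always lies in [0, 1] and grows as the embedding preserves neighborhoods better, which is consistent with the bounds and lower-bound assertions in the new tests below.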
features/dimension-reduction-quality-metrics-tool/tests/test_fnn.py (new file: 136 additions, 0 deletions)
@@ -0,0 +1,136 @@
"""Tests for the knn-search module."""

import numpy
import pytest
import sklearn.datasets

from polus.tabular.features.dimension_reduction_quality_metrics.metrics.fnn import fnn
from polus.tabular.features.dimension_reduction_quality_metrics.metrics.fnn import (
knn_search,
)
from polus.tabular.transforms.dimension_reduction.algorithms import umap


def test_knn_search():
"""Tests for knn-search."""

data = numpy.asarray(
[[i, i, i] for i in range(10)],
dtype=numpy.float32,
)
queries = data[:2, :]

assert data.shape[1] == queries.shape[1]

k = 2
metric = "euclidean"
dists, indices = knn_search(data, queries, k, metric)

assert dists.shape == (queries.shape[0], k)
assert indices.shape == (queries.shape[0], k)

expected_dists = numpy.sqrt(
numpy.asarray(
[[0.0, 3.0], [0.0, 3.0]],
dtype=numpy.float32,
)
)
numpy.testing.assert_allclose(dists, expected_dists)

expected_indices = numpy.asarray(
[[0, 1], [1, 0]],
dtype=numpy.int32,
)
numpy.testing.assert_array_equal(indices, expected_indices)


def gen_data(metric: str) -> tuple[numpy.ndarray, numpy.ndarray]:
digits = sklearn.datasets.load_digits()
original_data: numpy.ndarray = digits.data
embedded_data = umap.reduce(
data=original_data,
n_components=3,
n_neighbors=15,
metric=metric,
)
return original_data, embedded_data


@pytest.mark.parametrize("metric", ["euclidean", "cosine"])
def test_fnn(metric: str):
"""Tests for False Nearest Neighbors (FNN)."""

original_data, embedded_data = gen_data(metric)
for num_queries in [10, 100, 200]:
rng = numpy.random.default_rng()
query_indices = rng.choice(
original_data.shape[0],
size=num_queries,
replace=False,
)
for k in [10, 100]:
fnn_metric = fnn(
original_data=original_data,
embedded_data=embedded_data,
query_indices=query_indices,
n_neighbors=k,
distance_metric=metric,
)

msg = f"metric: {metric}, k: {k}, num_queries: {num_queries}"
assert 0.0 <= fnn_metric <= 1.0, f"FNN: {fnn_metric:.6f}, {msg}"
expected_fnn = expected_failure_threshold(
num_queries=num_queries,
k=k,
metric=metric,
)
assert (
fnn_metric >= expected_fnn
), f"FNN: {fnn_metric:.6f} < {expected_fnn:.6f}, {msg}"


def expected_failure_threshold(
num_queries: int,
k: int,
metric: str,
) -> float:
threshold = None

# These thresholds are based on the averages of several measurements
if metric == "euclidean":
if k == 10:
if num_queries == 10:
threshold = 0.49
elif num_queries == 100:
threshold = 0.60
elif num_queries == 200:
threshold = 0.59
elif k == 100:
if num_queries == 10:
threshold = 0.58
elif num_queries == 100:
threshold = 0.65
elif num_queries == 200:
threshold = 0.67
elif metric == "cosine":
if k == 10:
if num_queries == 10:
threshold = 0.44
elif num_queries == 100:
threshold = 0.45
elif num_queries == 200:
threshold = 0.50
elif k == 100:
if num_queries == 10:
threshold = 0.56
elif num_queries == 100:
threshold = 0.65
elif num_queries == 200:
threshold = 0.65

if threshold is None:
threshold = 0.0 # If the parameters are not in the table, return 0.0
else:
threshold -= 0.1 # This gives us more leeway to pass the tests

return threshold
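Two details of the new tests are worth spelling out. In test_knn_search, consecutive points of the toy data differ by 1 in each of three coordinates, so each query's nearest neighbor is itself at distance 0 and the next-nearest point is sqrt(1^2 + 1^2 + 1^2) = sqrt(3) away, which is exactly the expected_dists of sqrt([0.0, 3.0]). In test_fnn, gen_data loads scikit-learn's digits dataset (1,797 samples with 64 features) and embeds it into 3 dimensions with UMAP before the neighbor sets are compared.

The nested if/elif ladder in expected_failure_threshold encodes a small lookup table keyed by (metric, k, num_queries). Purely as an illustration of an equivalent structure, not part of this commit, the same table could be written as a dict; the name expected_failure_threshold_alt is hypothetical.

# Hypothetical dict-based equivalent of expected_failure_threshold; the
# threshold values are copied from the function above.
_THRESHOLDS = {
    ("euclidean", 10, 10): 0.49,
    ("euclidean", 10, 100): 0.60,
    ("euclidean", 10, 200): 0.59,
    ("euclidean", 100, 10): 0.58,
    ("euclidean", 100, 100): 0.65,
    ("euclidean", 100, 200): 0.67,
    ("cosine", 10, 10): 0.44,
    ("cosine", 10, 100): 0.45,
    ("cosine", 10, 200): 0.50,
    ("cosine", 100, 10): 0.56,
    ("cosine", 100, 100): 0.65,
    ("cosine", 100, 200): 0.65,
}


def expected_failure_threshold_alt(num_queries: int, k: int, metric: str) -> float:
    threshold = _THRESHOLDS.get((metric, k, num_queries))
    # Unknown parameter combinations fall back to 0.0; known ones keep the 0.1 leeway.
    return 0.0 if threshold is None else threshold - 0.1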
@@ -1,7 +1,6 @@
"""Dimension Reduction algorithms supported by this tool."""

import enum
-import typing

from . import pca
from . import tsne