From 9ad3fead5f8fa39777b99f99f06ab4dc106ea2d7 Mon Sep 17 00:00:00 2001 From: Vivek Anand <37226693+Vivdaddy@users.noreply.github.com> Date: Tue, 12 Mar 2024 03:53:25 -0400 Subject: [PATCH] Synthetic Data Sampling from Manifold (#76) * Added synthetic data sampling with LinearSubspace as the first one * Added more explanation for linear subspace. Fixed the docs --- .gitignore | 1 + .vscode/settings.json | 8 - cblearn/datasets/__init__.py | 5 +- cblearn/datasets/_base.py | 66 ++++++ cblearn/datasets/_datatypes.py | 10 + cblearn/datasets/_linear_subspace.py | 194 +++++++++++++++ cblearn/datasets/_triplet_response.py | 12 +- .../datasets/tests/test_linear_subspace.py | 224 ++++++++++++++++++ docs/references/index.rst | 12 + 9 files changed, 512 insertions(+), 20 deletions(-) delete mode 100644 .vscode/settings.json create mode 100644 cblearn/datasets/_base.py create mode 100644 cblearn/datasets/_datatypes.py create mode 100644 cblearn/datasets/_linear_subspace.py create mode 100644 cblearn/datasets/tests/test_linear_subspace.py diff --git a/.gitignore b/.gitignore index 32bb9f9..4eaa37f 100644 --- a/.gitignore +++ b/.gitignore @@ -72,6 +72,7 @@ instance/ docs/_build/ docs/references/generated/ docs/generated_examples/ +docs/sg_execution_times.rst # PyBuilder target/ diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 1836391..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "python.testing.pytestArgs": [ - "cblearn", - "--remote-data" - ], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true -} \ No newline at end of file diff --git a/cblearn/datasets/__init__.py b/cblearn/datasets/__init__.py index 650e279..2eef04c 100644 --- a/cblearn/datasets/__init__.py +++ b/cblearn/datasets/__init__.py @@ -14,4 +14,7 @@ from ._triplet_indices import make_all_triplet_indices from ._triplet_indices import make_random_triplet_indices from ._triplet_response import triplet_response -from ._triplet_response import noisy_triplet_response \ No newline at end of file +from ._triplet_response import noisy_triplet_response + +from ._linear_subspace import LinearSubspace +from ._base import BaseManifold \ No newline at end of file diff --git a/cblearn/datasets/_base.py b/cblearn/datasets/_base.py new file mode 100644 index 0000000..c27d34d --- /dev/null +++ b/cblearn/datasets/_base.py @@ -0,0 +1,66 @@ +from abc import ABC, abstractmethod +from copy import deepcopy + + +class BaseManifold(ABC): + """ + Base class for manifold samplers. + """ + + def __init__(self, **kwargs): + """ + Initialize the manifold + """ + self._params = {} + for key, value in kwargs.items(): + setattr(self, key, value) + + @abstractmethod + def _create_manifold(self): + """ + Create the manifold + """ + pass + + @abstractmethod + def sample_points(self, **kwargs): + """ + Sample points from the manifold + """ + pass + + @abstractmethod + def get_canonical_distance_matrix(self, **kwargs): + """ + Get the distance matrix of the points sampled + """ + pass + + def get_params(self): + """ + Get the parameters of the manifold + + Returns: + The parameters of the manifold + """ + return {attr: getattr(self, attr) for attr in dir(self) if not callable( + getattr(self, attr)) and not attr.startswith('_')} + + def set_params(self, params): + """ + Set the parameters of the manifold + + Args: + The parameters to set + """ + for attr, value in params.items(): + setattr(self, attr, value) + + def clone(self): + """ + Clone the manifold + + Returns: + A clone of the manifold + """ + return deepcopy(self) diff --git a/cblearn/datasets/_datatypes.py b/cblearn/datasets/_datatypes.py new file mode 100644 index 0000000..6666ddc --- /dev/null +++ b/cblearn/datasets/_datatypes.py @@ -0,0 +1,10 @@ +import enum + +class NoiseTarget(enum.Enum): + POINTS = 'points' + DIFFERENCES = 'differences' + + +class Distance(enum.Enum): + EUCLIDEAN = 'euclidean' + PRECOMPUTED = 'precomputed' \ No newline at end of file diff --git a/cblearn/datasets/_linear_subspace.py b/cblearn/datasets/_linear_subspace.py new file mode 100644 index 0000000..a873e02 --- /dev/null +++ b/cblearn/datasets/_linear_subspace.py @@ -0,0 +1,194 @@ +from cblearn.datasets._base import BaseManifold +import numpy as np +from scipy.stats import ortho_group +from sklearn.utils import check_random_state +from typing import Union, Dict, Callable +from scipy.spatial.distance import pdist, squareform + + +class LinearSubspace(BaseManifold): + """ + Linear Subspace + + Linear Subspace is a class for creating a hyperplane of a given + subspace dimension embedded in a higher dimensional space. It gives + a method of generating synthetic points with intrinsic + structure and dimensionality. The generated points are then meant + to be used for generating ordinal data. + + A reason for needing synthetically generated points is that it is + often difficult to comprehensively evaluate the performance of + ordinal methods on real data. Being able to modify the underlying + geometry and structure of the data allows for better + experimentation and control in evaluating ordinal methods. + + This class inherits from the BaseManifold class. This class creates + hyperplanes reproducibly using the scipy.stats.ortho_group function + for a given random state. This class can sample points from the + hyperplane using a given sampling function and add noise to the + points using a given noise function. + + .. note:: Subspace dimension must be less than or equal to space + dimension and space dimension must be greater than 1. + + Attributes: + subspace_dimension: Dimension of the subspace + space_dimension: Dimension of the space + random_state: Random state for reproducibility of the manifold + created: Flag to check if the hyperplane has been created + basis: Basis of the hyperplane + + Examples: + >>> from cblearn.datasets import LinearSubspace, make_random_triplet_indices, triplet_response + >>> # Creates a 1-dimensional hyperplane in 3-dimensional space + >>> manifold = LinearSubspace(subspace_dimension=1, space_dimension=3) + >>> # Samples 10 points from the created hyperplane + >>> points, distances = manifold.sample_points(num_points=10) + >>> print(points.shape) + (10, 3) + >>> print(distances.shape) + (10, 10) + >>> # Sampling 10 points with noise + >>> noisy_points, noisy_distances = manifold.sample_points(10, noise='normal', noise_options={'scale': 0.1}) + >>> # Responding to triplets based on distance matrix + >>> triplets = make_random_triplet_indices(n_objects=10, size=100) + >>> response = triplet_response(triplets, distances, distance='precomputed') + """ + def __init__(self, subspace_dimension: int, space_dimension: int, + random_state: Union[None, int, np.random.RandomState] = None): + """ + Initialize the manifold + + Args: + subspace_dimension: Dimension of the hyperplane + space_dimension: Dimension of the space in which the hyperplane + is embedded + random_state: The seed of the pseudo random number generator + to use when sampling. If None, the random number + generator is the RandomState instance used by + np.random. + """ + if not isinstance(subspace_dimension, int): + raise ValueError('Subspace dimension must be an integer') + if subspace_dimension < 1: + raise ValueError('Subspace dimension cannot be less than 1') + if not isinstance(space_dimension, int): + raise ValueError('Space dimension must be an integer') + if subspace_dimension > space_dimension: + raise ValueError('Subspace dimension cannot be greater than' + ' dimension') + if space_dimension <= 1: + raise ValueError('Space dimension cannot be less than 2') + self.subspace_dimension = subspace_dimension + self.space_dimension = space_dimension + random_state = check_random_state(random_state) + self.manifold_state = random_state + self.created = False + super().__init__(subspace_dimension=subspace_dimension, + space_dimension=space_dimension, + random_state=random_state) + + def _create_manifold(self): + """ Creates the hyperplane """ + # Source: + # https://stackoverflow.com/questions/69036765/sampling-random-points-from-linear-subspaces-of-a-given-radius-in-arbitary-dimen + if self.subspace_dimension == 1: + scipy_random_generator = ortho_group + scipy_random_generator.random_state = self.manifold_state + basis = scipy_random_generator.rvs(dim=self.space_dimension)[:2] + else: + scipy_random_generator = ortho_group + scipy_random_generator.random_state = self.manifold_state + basis = scipy_random_generator.rvs(dim=self.space_dimension)[ + :self.subspace_dimension] + self.basis = basis + self.created = True + + def sample_points(self, num_points: int, + sampling_function: Union[str, Callable] = 'normal', + sampling_options: Dict = {'scale': 1}, + noise: Union[None, str, Callable] = None, + noise_options: Dict = {}, + random_state: Union[None, int, np.random.RandomState] = None, + return_distances: bool = True): + """ + Sample points from the hyperplane and add noise if requested + + Args: + num_points: Number of points to sample + sampling_function: The sampling function to use. + If a string, it should be a method of + the random state object. If a callable, + it should be a function that takes a + size argument and returns a numpy array + of samples. + sampling_options: The options to pass to the sampling function. + noise: The noise function to use. If a string, it should be + a method of the random state object. If a callable, + it should be a function that takes a size argument and + returns a numpy array of samples. + noise_options: The options to pass to the noise function. + random_state: The seed of the pseudo random number generator + to use when sampling. If None, the random number + generator is the RandomState instance used by + np.random. + return_distances: Flag to return the distance matrix of + the sampled points. Defaults to True. + + Returns: + The sampled points. If return_distances is True, the distance + matrix of the sampled points (num_points, num_points) is also + returned. + """ + # Create Manifold if not already created + if not self.created: + self._create_manifold() + + # Get Noise Function + if isinstance(noise, str): + random_state = check_random_state(random_state) + noise_fun: Callable = getattr(random_state, noise) + elif callable(noise): + noise_fun = noise + + # Get Sampling Function + if isinstance(sampling_function, str): + random_state = check_random_state(random_state) + sampling_fun: Callable = getattr(random_state, sampling_function) + elif callable(sampling_function): + sampling_fun = sampling_function + + # Sample Coefficients + if self.subspace_dimension == 1: + coefficients = sampling_fun( + size=(num_points, 1), **sampling_options) + points = np.matmul(coefficients.reshape(-1, 1), + self.basis[0].reshape(1, -1)) + self.basis[1] + else: + coefficients = sampling_fun( + size=(num_points, self.subspace_dimension), + **sampling_options) + points = np.matmul(coefficients, self.basis) + + # Add noise if requested + if noise is not None: + noise = noise_fun(size=points.shape, **noise_options) + points = points + noise + + if return_distances: + return points, self.get_canonical_distance_matrix(points) + else: + return points + + def get_canonical_distance_matrix(self, points: np.ndarray): + """ + Get the distance matrix of the points sampled + + Args: + points: The points sampled from the hyperplane + + Returns: + The distance matrix of the points sampled (num_points, + num_points) + """ + return squareform(pdist(points)) diff --git a/cblearn/datasets/_triplet_response.py b/cblearn/datasets/_triplet_response.py index b237b7e..2cfd736 100644 --- a/cblearn/datasets/_triplet_response.py +++ b/cblearn/datasets/_triplet_response.py @@ -1,5 +1,4 @@ """ Function in this file judge triplets, based on ground-truth embedding and possible noise patterns. """ -import enum from typing import Dict, Callable, Optional, Union from sklearn.utils import check_random_state, check_array @@ -7,16 +6,7 @@ import numpy as np from cblearn import utils - - -class NoiseTarget(enum.Enum): - POINTS = 'points' - DIFFERENCES = 'differences' - - -class Distance(enum.Enum): - EUCLIDEAN = 'euclidean' - PRECOMPUTED = 'precomputed' +from cblearn.datasets._datatypes import NoiseTarget, Distance def noisy_triplet_response(triplets: utils.Query, embedding: np.ndarray, result_format: Optional[str] = None, diff --git a/cblearn/datasets/tests/test_linear_subspace.py b/cblearn/datasets/tests/test_linear_subspace.py new file mode 100644 index 0000000..b4af57e --- /dev/null +++ b/cblearn/datasets/tests/test_linear_subspace.py @@ -0,0 +1,224 @@ +import numpy as np +import pytest +from cblearn.datasets import LinearSubspace +from functools import partial +from scipy.spatial.distance import pdist, squareform + +# Subspace Dimension and Space Dimension tests + + +def test_subspace_dimension_1(): + manifold = LinearSubspace(subspace_dimension=1, space_dimension=3) + assert manifold.subspace_dimension == 1 + points, _ = manifold.sample_points(num_points=10) + assert points.shape == (10, 3) + assert manifold.basis.shape == (2, 3) + + +def subspace_dimension_greater_than_space_dimension(subspace_dimension, space_dimension, random_state): + _ = LinearSubspace(subspace_dimension=subspace_dimension, space_dimension=space_dimension, random_state=random_state) + # Add your actual test logic here + + +@pytest.mark.parametrize("subspace_dimension, space_dimension, random_state", [ + (1, 1, 1), + (1, 0, 1), + (2, 1, 1), + (4, 3, 1), +]) +def test_subspace_dimension_greater_than_space_dimension(subspace_dimension, space_dimension, random_state): + with pytest.raises(ValueError): + subspace_dimension_greater_than_space_dimension(subspace_dimension, space_dimension, random_state) + + +def invalid_subspace_dimension(subspace_dimension): + _ = LinearSubspace(subspace_dimension=subspace_dimension, space_dimension=3) + + +@pytest.mark.parametrize("subspace_dimension", [-1, 0, "2", 1.5] + ) +def test_invalid_subspace_dimension(subspace_dimension): + with pytest.raises(ValueError): + invalid_subspace_dimension(subspace_dimension) + + +def invalid_space_dimension(space_dimension): + _ = LinearSubspace(subspace_dimension=1, space_dimension=space_dimension) + + +@pytest.mark.parametrize("space_dimension", [-1, 0, "2", 1.5] + ) +def test_invalid_space_dimension(space_dimension): + with pytest.raises(ValueError): + invalid_space_dimension(space_dimension) + +# # Noise and Sampling Function Tests + + +@pytest.mark.parametrize("subspace_dimension, space_dimension", [ + (1, 2), + (2, 2), + (4, 4), + (1, 3), + (3, 10) +]) +def test_no_random_state(subspace_dimension, space_dimension): + manifold = LinearSubspace(subspace_dimension=subspace_dimension, space_dimension=space_dimension, random_state=None) + points, _ = manifold.sample_points(num_points=10) + assert points.shape == (10, space_dimension) + + +@pytest.mark.parametrize("subspace_dimension, space_dimension, random_state", [ + (1, 2, 1), + (2, 2, 2), + (4, 4, 10), + (1, 3, 1), + (3, 10, 1) +]) +def test_no_noise(subspace_dimension, space_dimension, random_state): + manifold1 = LinearSubspace( + subspace_dimension=subspace_dimension, + space_dimension=space_dimension, + random_state=random_state) + manifold1._create_manifold() + manifold2 = LinearSubspace( + subspace_dimension=subspace_dimension, + space_dimension=space_dimension, + random_state=random_state) + manifold2._create_manifold() + # Check that basis is the same + np.testing.assert_array_equal(manifold1.basis, manifold2.basis) + + +@pytest.mark.parametrize("subspace_dimension, space_dimension, noise", [ + (1, 2, 'normal'), + (2, 2, 'laplace'), + (4, 4, 'normal'), + (1, 3, 'gumbel'), + (3, 10, 'normal') +]) +def test_noise_string(subspace_dimension, space_dimension, noise): + manifold = LinearSubspace(subspace_dimension=subspace_dimension, space_dimension=space_dimension, random_state=1) + points = manifold.sample_points(num_points=100, noise=noise) + assert points[0].shape == (100, space_dimension) + + +def randint_wrapper(low, high, size): + return np.random.randint(low=low, high=high, size=size) + + +@pytest.mark.parametrize("subspace_dimension, space_dimension, noise", [ + (1, 2, np.random.normal), + (2, 2, partial(randint_wrapper, low=1, high=10)), + (4, 4, np.random.normal), + (1, 3, partial(randint_wrapper, low=1, high=10)), + (3, 10, np.random.gumbel) +]) +def test_noise_callable(subspace_dimension, space_dimension, noise): + manifold = LinearSubspace(subspace_dimension=subspace_dimension, space_dimension=space_dimension, random_state=1) + points = manifold.sample_points(num_points=100, noise=noise) + assert points[0].shape == (100, space_dimension) + + +@pytest.mark.parametrize("subspace_dimension, space_dimension, sampling_function, random_state", [ + (1, 2, 'normal', 1), + (2, 2, 'normal', 1), + (4, 4, 'normal', 1), + (1, 3, 'normal', 1), + (3, 10, 'normal', 1) +]) +def test_sampling_function(subspace_dimension, space_dimension, sampling_function, random_state): + manifold1 = LinearSubspace( + subspace_dimension=subspace_dimension, + space_dimension=space_dimension, + random_state=random_state) + points1 = manifold1.sample_points(num_points=100, sampling_function=sampling_function, random_state=random_state) + assert points1[0].shape == (100, space_dimension) + manifold2 = LinearSubspace( + subspace_dimension=subspace_dimension, + space_dimension=space_dimension, + random_state=random_state) + points2 = manifold2.sample_points(num_points=100, sampling_function=sampling_function, random_state=random_state) + np.testing.assert_array_equal(points1[0], points2[0]) + +# # Test distances or not + + +@pytest.mark.parametrize("subspace_dimension, space_dimension, return_distances", [ + (1, 2, False), + (2, 2, False), + (4, 4, False), + (1, 3, False), + (3, 10, False) +]) +def test_sample_points_return_no_distances(subspace_dimension, space_dimension, return_distances): + manifold = LinearSubspace(subspace_dimension=subspace_dimension, space_dimension=space_dimension, random_state=1) + points = manifold.sample_points(num_points=100, return_distances=return_distances) + assert points.shape == (100, space_dimension) + + +@pytest.mark.parametrize("subspace_dimension, space_dimension, return_distances", [ + (1, 2, True), + (2, 2, True), + (4, 4, True), + (1, 3, True), + (3, 10, True) +]) +def test_sample_points_return_distances(subspace_dimension, space_dimension, return_distances): + manifold = LinearSubspace(subspace_dimension=subspace_dimension, space_dimension=space_dimension, random_state=1) + points, distances = manifold.sample_points(num_points=100, return_distances=return_distances) + assert points.shape == (100, space_dimension) + assert distances.shape == (100, 100) + assert np.allclose(distances, squareform(pdist(points))) + + +@pytest.mark.parametrize("subspace_dimension, space_dimension, return_distances, noise", [ + (1, 2, True, 'normal'), + (2, 2, True, 'normal'), + (4, 4, True, 'normal'), + (1, 3, True, 'normal'), + (3, 10, True, 'normal') +]) +def test_canonical_distance_matrix(subspace_dimension, space_dimension, return_distances, noise): + manifold = LinearSubspace(subspace_dimension=subspace_dimension, space_dimension=space_dimension, random_state=1) + points, distances = manifold.sample_points(num_points=100, return_distances=return_distances, noise=noise) + assert np.allclose(distances, manifold.get_canonical_distance_matrix(points)) + + +# # Test saving, loading and cloning + +@pytest.mark.parametrize("subspace_dimension, space_dimension, random_state", [ + (1, 2, 1), + (2, 2, 2), + (4, 4, 10), + (1, 3, 1), + (3, 10, 1) +]) +def test_set_get_params(subspace_dimension, space_dimension, random_state): + manifold1 = LinearSubspace( + subspace_dimension=subspace_dimension, + space_dimension=space_dimension, + random_state=random_state) + manifold1._create_manifold() + manifold2 = LinearSubspace(subspace_dimension=subspace_dimension, space_dimension=space_dimension, random_state=3) + manifold2._create_manifold() + params = manifold1.get_params() + manifold2.set_params(params) + np.testing.assert_equal(manifold1.basis, manifold2.basis) + + +@pytest.mark.parametrize("subspace_dimension, space_dimension, random_state", [ + (1, 2, 1), + (2, 2, 2), + (4, 4, 10), + (1, 3, 1), + (3, 10, 1) +]) +def test_clone(subspace_dimension, space_dimension, random_state): + manifold = LinearSubspace( + subspace_dimension=subspace_dimension, + space_dimension=space_dimension, + random_state=random_state) + manifold._create_manifold() + manifold_clone = manifold.clone() + np.testing.assert_equal(manifold.basis, manifold_clone.basis) diff --git a/docs/references/index.rst b/docs/references/index.rst index cb691b5..2f13459 100644 --- a/docs/references/index.rst +++ b/docs/references/index.rst @@ -31,6 +31,17 @@ Loaders datasets.fetch_similarity_matrix +Synthetic Point Generation +-------------------------- + +.. currentmodule:: cblearn + +.. autosummary:: + :toctree: generated/ + + datasets.LinearSubspace + + Simulations ----------- @@ -89,6 +100,7 @@ Utility .. _references_embedding_wrapper: + Wrapper -------