Skip to content

Commit

Permalink
Synthetic Data Sampling from Manifold (cblearn#76)
Browse files Browse the repository at this point in the history
* Added synthetic data sampling with LinearSubspace as the first one
* Added more explanation for linear subspace. Fixed the docs
  • Loading branch information
vivek2000anand authored Mar 12, 2024
1 parent 780f6fe commit 9ad3fea
Show file tree
Hide file tree
Showing 9 changed files with 512 additions and 20 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ instance/
docs/_build/
docs/references/generated/
docs/generated_examples/
docs/sg_execution_times.rst

# PyBuilder
target/
Expand Down
8 changes: 0 additions & 8 deletions .vscode/settings.json

This file was deleted.

5 changes: 4 additions & 1 deletion cblearn/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,7 @@
from ._triplet_indices import make_all_triplet_indices
from ._triplet_indices import make_random_triplet_indices
from ._triplet_response import triplet_response
from ._triplet_response import noisy_triplet_response
from ._triplet_response import noisy_triplet_response

from ._linear_subspace import LinearSubspace
from ._base import BaseManifold
66 changes: 66 additions & 0 deletions cblearn/datasets/_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from abc import ABC, abstractmethod
from copy import deepcopy


class BaseManifold(ABC):
    """
    Abstract interface shared by manifold samplers.

    Keyword arguments handed to the constructor are stored verbatim as
    instance attributes. Concrete subclasses have to implement
    ``_create_manifold``, ``sample_points`` and
    ``get_canonical_distance_matrix``.
    """

    def __init__(self, **kwargs):
        """
        Store every keyword argument as an attribute of the same name.

        Args:
            **kwargs: Arbitrary name/value pairs to attach to the instance.
        """
        self._params = {}
        for name in kwargs:
            setattr(self, name, kwargs[name])

    @abstractmethod
    def _create_manifold(self):
        """Build the manifold. Must be provided by the subclass."""

    @abstractmethod
    def sample_points(self, **kwargs):
        """Draw points from the manifold. Must be provided by the subclass."""

    @abstractmethod
    def get_canonical_distance_matrix(self, **kwargs):
        """
        Return the distance matrix of sampled points. Must be provided by
        the subclass.
        """

    def get_params(self):
        """
        Collect the public, non-callable attributes of this object.

        Returns:
            A dict mapping attribute name to value for every entry of
            ``dir(self)`` that is not callable and whose name does not
            start with an underscore.
        """
        public = {}
        for name in dir(self):
            # Look the attribute up before filtering on the name, so every
            # entry of dir(self) is accessed.
            value = getattr(self, name)
            if callable(value) or name.startswith('_'):
                continue
            public[name] = value
        return public

    def set_params(self, params):
        """
        Assign several attributes at once.

        Args:
            params: Mapping of attribute name to the value to set.
        """
        for name, new_value in params.items():
            setattr(self, name, new_value)

    def clone(self):
        """
        Make an independent copy of this manifold.

        Returns:
            A deep copy of the instance.
        """
        duplicate = deepcopy(self)
        return duplicate
10 changes: 10 additions & 0 deletions cblearn/datasets/_datatypes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import enum

class NoiseTarget(enum.Enum):
    """
    Closed set of names for where noise is applied.

    NOTE(review): presumably selects whether noise perturbs the embedded
    points or the distance differences when answering triplets -- confirm
    against the code that consumes this enum.
    """

    # Value string 'points'.
    POINTS = 'points'
    # Value string 'differences'.
    DIFFERENCES = 'differences'


class Distance(enum.Enum):
    """
    Closed set of names for how a distance input is interpreted.

    NOTE(review): 'precomputed' presumably means the caller supplies a
    distance matrix instead of coordinates -- confirm against the code
    that consumes this enum.
    """

    # Value string 'euclidean'.
    EUCLIDEAN = 'euclidean'
    # Value string 'precomputed'.
    PRECOMPUTED = 'precomputed'
194 changes: 194 additions & 0 deletions cblearn/datasets/_linear_subspace.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
from cblearn.datasets._base import BaseManifold
import numpy as np
from scipy.stats import ortho_group
from sklearn.utils import check_random_state
from typing import Union, Dict, Callable
from scipy.spatial.distance import pdist, squareform


class LinearSubspace(BaseManifold):
    """
    Linear Subspace

    Linear Subspace is a class for creating a hyperplane of a given
    subspace dimension embedded in a higher dimensional space. It gives
    a method of generating synthetic points with intrinsic
    structure and dimensionality. The generated points are then meant
    to be used for generating ordinal data.

    A reason for needing synthetically generated points is that it is
    often difficult to comprehensively evaluate the performance of
    ordinal methods on real data. Being able to modify the underlying
    geometry and structure of the data allows for better
    experimentation and control in evaluating ordinal methods.

    This class inherits from the BaseManifold class. This class creates
    hyperplanes reproducibly using the scipy.stats.ortho_group function
    for a given random state. This class can sample points from the
    hyperplane using a given sampling function and add noise to the
    points using a given noise function.

    .. note:: Subspace dimension must be less than or equal to space
              dimension and space dimension must be greater than 1.

    Attributes:
        subspace_dimension: Dimension of the subspace
        space_dimension: Dimension of the space
        random_state: Random state for reproducibility of the manifold
        manifold_state: The same random state object; it is the one used
            to draw the basis
        created: Flag to check if the hyperplane has been created
        basis: Basis of the hyperplane (set once the hyperplane has been
            created)

    Examples:
        >>> from cblearn.datasets import LinearSubspace, make_random_triplet_indices, triplet_response
        >>> # Creates a 1-dimensional hyperplane in 3-dimensional space
        >>> manifold = LinearSubspace(subspace_dimension=1, space_dimension=3)
        >>> # Samples 10 points from the created hyperplane
        >>> points, distances = manifold.sample_points(num_points=10)
        >>> print(points.shape)
        (10, 3)
        >>> print(distances.shape)
        (10, 10)
        >>> # Sampling 10 points with noise
        >>> noisy_points, noisy_distances = manifold.sample_points(10, noise='normal', noise_options={'scale': 0.1})
        >>> # Responding to triplets based on distance matrix
        >>> triplets = make_random_triplet_indices(n_objects=10, size=100)
        >>> response = triplet_response(triplets, distances, distance='precomputed')
    """

    def __init__(self, subspace_dimension: int, space_dimension: int,
                 random_state: Union[None, int, np.random.RandomState] = None):
        """
        Initialize the manifold

        Args:
            subspace_dimension: Dimension of the hyperplane
            space_dimension: Dimension of the space in which the hyperplane
                             is embedded
            random_state: The seed of the pseudo random number generator
                          to use when sampling. If None, the random number
                          generator is the RandomState instance used by
                          np.random.

        Raises:
            ValueError: If a dimension is not an integer, if
                        subspace_dimension is less than 1 or greater than
                        space_dimension, or if space_dimension is less
                        than 2.
        """
        # NumPy integer scalars are accepted next to built-in ints.
        integer_types = (int, np.integer)
        if not isinstance(subspace_dimension, integer_types):
            raise ValueError('Subspace dimension must be an integer')
        if subspace_dimension < 1:
            raise ValueError('Subspace dimension cannot be less than 1')
        if not isinstance(space_dimension, integer_types):
            raise ValueError('Space dimension must be an integer')
        if subspace_dimension > space_dimension:
            raise ValueError('Subspace dimension cannot be greater than'
                             ' dimension')
        if space_dimension <= 1:
            raise ValueError('Space dimension cannot be less than 2')
        self.subspace_dimension = subspace_dimension
        self.space_dimension = space_dimension
        # Resolve the seed once so the basis is drawn from a single,
        # reproducible generator.
        random_state = check_random_state(random_state)
        self.manifold_state = random_state
        # The basis is built lazily on the first call to sample_points.
        self.created = False
        super().__init__(subspace_dimension=subspace_dimension,
                         space_dimension=space_dimension,
                         random_state=random_state)

    def _create_manifold(self):
        """
        Creates the hyperplane

        Draws a random orthogonal matrix and keeps its leading rows as
        ``self.basis``, then sets ``self.created``. For a 1-dimensional
        subspace two rows are kept: ``sample_points`` uses the first as
        the direction of the line and adds the second to every point as
        an offset.
        """
        # Source:
        # https://stackoverflow.com/questions/69036765/sampling-random-points-from-linear-subspaces-of-a-given-radius-in-arbitary-dimen
        if self.subspace_dimension == 1:
            # NOTE(review): only the 1-dimensional case gets an offset row;
            # higher-dimensional subspaces pass through the origin.
            num_rows = 2
        else:
            num_rows = self.subspace_dimension
        # Hand the generator to rvs directly. Assigning it to
        # ortho_group.random_state would change the shared module-level
        # scipy object for every other user in the process.
        rotation = ortho_group.rvs(dim=self.space_dimension,
                                   random_state=self.manifold_state)
        self.basis = rotation[:num_rows]
        self.created = True

    @staticmethod
    def _resolve_generator(spec, rng, role: str) -> Callable:
        """
        Turn a sampling/noise specification into a callable.

        Args:
            spec: Name of a method of ``rng`` or a callable.
            rng: Random state object used to look up method names.
            role: Argument name used in the error message.

        Returns:
            The method of ``rng`` called ``spec`` if ``spec`` is a string,
            otherwise ``spec`` itself.

        Raises:
            ValueError: If ``spec`` is neither a string nor a callable.
        """
        if isinstance(spec, str):
            return getattr(rng, spec)
        if callable(spec):
            return spec
        raise ValueError(f'{role} must be a string or a callable,'
                         f' got {type(spec).__name__}')

    def sample_points(self, num_points: int,
                      sampling_function: Union[str, Callable] = 'normal',
                      sampling_options: Union[None, Dict] = None,
                      noise: Union[None, str, Callable] = None,
                      noise_options: Union[None, Dict] = None,
                      random_state: Union[None, int, np.random.RandomState] = None,
                      return_distances: bool = True):
        """
        Sample points from the hyperplane and add noise if requested

        Args:
            num_points: Number of points to sample
            sampling_function: The sampling function to use.
                               If a string, it should be a method of
                               the random state object. If a callable,
                               it should be a function that takes a
                               size argument and returns a numpy array
                               of samples.
            sampling_options: The options to pass to the sampling function.
                              Defaults to ``{'scale': 1}`` when None.
            noise: The noise function to use. If a string, it should be
                   a method of the random state object. If a callable,
                   it should be a function that takes a size argument and
                   returns a numpy array of samples.
            noise_options: The options to pass to the noise function.
                           Defaults to no options when None.
            random_state: The seed of the pseudo random number generator
                          to use when sampling. If None, the random number
                          generator is the RandomState instance used by
                          np.random.
            return_distances: Flag to return the distance matrix of
                              the sampled points. Defaults to True.

        Returns:
            The sampled points. If return_distances is True, the distance
            matrix of the sampled points (num_points, num_points) is also
            returned.

        Raises:
            ValueError: If sampling_function, or a noise that is not None,
                        is neither a string nor a callable.
        """
        # Create Manifold if not already created
        if not self.created:
            self._create_manifold()

        # None stands in for the defaults so that no dict object is shared
        # between calls.
        if sampling_options is None:
            sampling_options = {'scale': 1}
        if noise_options is None:
            noise_options = {}

        # The seed is only resolved when a method name has to be looked up;
        # noise and sampling then share one generator.
        rng = None
        if isinstance(noise, str) or isinstance(sampling_function, str):
            rng = check_random_state(random_state)

        # Get Noise Function
        noise_fun = None
        if noise is not None:
            noise_fun = self._resolve_generator(noise, rng, 'noise')

        # Get Sampling Function
        sampling_fun = self._resolve_generator(
            sampling_function, rng, 'sampling_function')

        # Sample Coefficients
        if self.subspace_dimension == 1:
            coefficients = sampling_fun(
                size=(num_points, 1), **sampling_options)
            # basis[0] is the direction of the line, basis[1] the offset
            # added to every point.
            points = np.matmul(coefficients.reshape(-1, 1),
                               self.basis[0].reshape(1, -1)) + self.basis[1]
        else:
            coefficients = sampling_fun(
                size=(num_points, self.subspace_dimension),
                **sampling_options)
            points = np.matmul(coefficients, self.basis)

        # Add noise if requested
        if noise_fun is not None:
            perturbation = noise_fun(size=points.shape, **noise_options)
            points = points + perturbation

        if return_distances:
            return points, self.get_canonical_distance_matrix(points)
        return points

    def get_canonical_distance_matrix(self, points: np.ndarray):
        """
        Get the distance matrix of the points sampled

        Args:
            points: The points sampled from the hyperplane

        Returns:
            The distance matrix of the points sampled (num_points,
            num_points), computed with scipy's pdist and squareform.
        """
        return squareform(pdist(points))
12 changes: 1 addition & 11 deletions cblearn/datasets/_triplet_response.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,12 @@
""" Function in this file judge triplets, based on ground-truth embedding and possible noise patterns. """
import enum
from typing import Dict, Callable, Optional, Union

from sklearn.utils import check_random_state, check_array
from sklearn.metrics import pairwise
import numpy as np

from cblearn import utils


class NoiseTarget(enum.Enum):
    """
    Closed set of names for where noise is applied.

    NOTE(review): presumably selects whether noise perturbs the embedded
    points or the distance differences when answering triplets -- confirm
    against the code that consumes this enum.
    """

    # Value string 'points'.
    POINTS = 'points'
    # Value string 'differences'.
    DIFFERENCES = 'differences'


class Distance(enum.Enum):
    """
    Closed set of names for how a distance input is interpreted.

    NOTE(review): 'precomputed' presumably means the caller supplies a
    distance matrix instead of coordinates -- confirm against the code
    that consumes this enum.
    """

    # Value string 'euclidean'.
    EUCLIDEAN = 'euclidean'
    # Value string 'precomputed'.
    PRECOMPUTED = 'precomputed'
from cblearn.datasets._datatypes import NoiseTarget, Distance


def noisy_triplet_response(triplets: utils.Query, embedding: np.ndarray, result_format: Optional[str] = None,
Expand Down
Loading

0 comments on commit 9ad3fea

Please sign in to comment.