Add inverse transform method that uses Numba instead of NumPy
darek74123 committed Apr 10, 2024
1 parent 989e39a commit 836485e
Showing 3 changed files with 137 additions and 0 deletions.
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,5 +1,6 @@
dataclasses
numpy~=1.26.0
numba~=0.59.0
pandas~=1.5
scikit-learn~=1.3.0
matplotlib~=3.8.0
64 changes: 64 additions & 0 deletions src/WMSDTransformer.py
@@ -1,5 +1,6 @@
import math
from abc import ABC, abstractmethod
import numba
import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin
@@ -199,6 +200,8 @@ def inverse_transform_numpy(self, target_mean, target_std, std_type='==', sampli
        for high-dimensional datasets. Therefore, this method samples the US space to identify a set of points
        that are sufficiently close (within a distance of `epsilon` or less) to the expected values of `target_mean`
        and `target_std`.
        This method utilizes NumPy vectorization to enhance performance. However, it requires storing all samples in RAM,
        which may not be possible for a large number of dimensions and a high value of the `sampling_density` parameter.
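        As a rough illustration of the memory footprint (the grid size follows from the sampling
        scheme; float32 storage is an assumption): sampling `d` values per criterion for `n`
        criteria yields `d**n` grid points, so `sampling_density=100` with 4 criteria gives
        100**4 = 10**8 points, i.e. about 1.6 GB of float32 coordinates.
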
        Parameters
        ----------
@@ -276,6 +279,67 @@ def inverse_transform_numpy(self, target_mean, target_std, std_type='==', sampli

        return solutions

    def inverse_transform_numba(self, target_mean, target_std, std_type='==', sampling_density=None, epsilon=0.01, verbose=False):
        """
        Find possible performance vectors (i.e., vectors of artificial alternatives' evaluations) in US space
        for which the weighted mean (WM) and weight-scaled standard deviation (WSD) are close to the expected values.
        The number of feasible solutions grows exponentially with the dimensionality (number of criteria) of
        the dataset. Computing the exact values of all solutions is computationally expensive, particularly
        for high-dimensional datasets. Therefore, this method samples the US space to identify a set of points
        that are sufficiently close (within a distance of `epsilon` or less) to the expected values of `target_mean`
        and `target_std`.
        This method uses the Numba just-in-time compiler to enhance performance without requiring all samples
        to be stored in RAM.

        Parameters
        ----------
        target_mean : float
            The expected value of the weighted mean score for the returned solutions (performance vectors).
        target_std : float
            The expected value of the weight-scaled standard deviation for the returned solutions (performance vectors).
        std_type : str, default='=='
            The nature of the WSD criterion varies depending on the aggregation function used and the WM. It might be
            considered as a gain-type or a cost-type criterion. By default, the method assumes that the WSD should
            be as close as possible to `target_std` ('=='). The value '<=' means that the WSD is a cost-type criterion,
            and therefore solutions that do not exceed `target_std` will be returned (larger deviations in the other
            direction, i.e. towards smaller WSD values, are acceptable). The symbol '>=' indicates that WSD is a gain-type
            criterion, and therefore the returned solutions will meet or exceed `target_std` (larger deviations in the
            other direction, i.e. towards larger WSD values, are acceptable).
            Must be one of the following strings: '==', '<=', '>='.
        sampling_density : int or None, default=None
            The `sampling_density` parameter determines how densely the Utility Space is sampled.
            By default, i.e., when `sampling_density=None`, its value is calculated based on the
            dimensionality of the dataset.
        epsilon : float, default=0.01
            Maximum allowed deviation of both WM and WSD (when `std_type='=='`), or of WM only (otherwise),
            from the target values. Must be in the range (0.0, 1.0].
        verbose : bool, default=False
            When set to True, the method prints the sampling density, the total number of sampled points,
            and the number of returned solutions.

        Returns
        -------
        solutions : DataFrame or None
            The method returns a DataFrame containing performance vectors that meet the requirements specified by
            `target_mean`, `target_std`, `std_type` and `epsilon`, or None if no points satisfying these requirements
            are found. In the latter scenario, it may be helpful to increase the value of `epsilon` at the expense of
            lower accuracy.
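
        Examples
        --------
        An illustrative sketch (assumptions: the transformer has already been fitted on
        a DataFrame `X`, so the criteria weights and column names are available; the
        constructor arguments are hypothetical):

        >>> wmsd = WMSDTransformer(...)  # hypothetical constructor arguments
        >>> _ = wmsd.fit(X)
        >>> solutions = wmsd.inverse_transform_numba(target_mean=0.5, target_std=0.1)
        >>> solutions is None or list(solutions.columns) == list(X.columns)
        True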
"""

from utils.numba_inverse_transform import inverse_transform
filtered_points = inverse_transform(target_mean, target_std, self.weights, std_type, sampling_density, epsilon, verbose)

if len(filtered_points) == 0:
solutions = None
else:
solutions = pd.DataFrame(filtered_points, columns=self.X.columns)

return solutions

    def plot(self, heatmap_quality=500, show_names=False, plot_name=None, color='jet'):

        """Plots positions of alternatives in WMSD space.
72 changes: 72 additions & 0 deletions src/utils/numba_inverse_transform.py
@@ -0,0 +1,72 @@
import time
import numpy as np
import math
import numba


@numba.jit(nopython=True)
def transform_US_to_wmsd_numba(X_US, weights):
    # transform data from Utility Space to WMSD Space
    w = weights
    squared_w = w ** 2
    sum_of_squared_weights = np.sum(squared_w)
    norm_w = np.sqrt(np.sum(squared_w))
    mean_weight = np.mean(w)

    s = norm_w / mean_weight
    v = X_US * w

    # vw is the projection of v onto the weight vector w; its norm and the norm of
    # the residual (v - vw), both rescaled by s, give the weighted mean and the
    # weight-scaled standard deviation
    vw = np.sum(v * w) / sum_of_squared_weights * w
    w_mean = np.sqrt(np.sum(vw ** 2)) / s
    w_std = np.sqrt(np.sum((v - vw) ** 2)) / s
    return w_mean, w_std
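
# Sanity check (illustrative): for equal weights w = (1, 1), the US point (1, 0)
# gives v = (1, 0), which projects onto w as vw = (0.5, 0.5); with s = sqrt(2),
# both WM and WSD equal sqrt(0.5) / sqrt(2) = 0.5 (up to floating-point rounding):
#
#     transform_US_to_wmsd_numba(np.array([1.0, 0.0]), np.array([1.0, 1.0]))  # -> (0.5, 0.5)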


@numba.jit(nopython=True)
def inverse_transform(target_mean, target_std, weights, std_type='==', sampling_density=None, epsilon=0.01, verbose=False):
    n_criteria = len(weights)
    # by default, choose a per-criterion density such that the full grid holds roughly 5,000,000 points
    if sampling_density is None:
        sampling_density = math.ceil(5000000 ** (1 / n_criteria))
    sampling_density = int(sampling_density)

    dims = [np.linspace(0, 1, sampling_density).astype(np.float32) for i in range(n_criteria)]  # Numba's np.linspace does not accept a dtype argument

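    # Mixed-radix decoding tables: divs[j] accumulates the product of the grid sizes
    # of all dimensions after j, and mods[j] is the size of dimension j, so a flat
    # sample index i maps to coordinate dims[j][(i // divs[j]) % mods[j]].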
    divs = []
    mods = []
    factor = 1
    for i in range((n_criteria - 1), -1, -1):
        items = len(dims[i])
        divs.insert(0, factor)
        mods.insert(0, items)
        factor *= items

    n_samples = 1
    for dim in dims:
        n_samples *= len(dim)
    if verbose:
        print(f"inverse_transform_numba: sampling_density: {sampling_density}")
        print(f"inverse_transform_numba: {n_samples} samples generated in total")

    filtered_points = []
    for i in range(0, n_samples):
        point = []
        for j in range(0, n_criteria):
            point.append(dims[j][i // divs[j] % mods[j]])
        point = np.array(point)
        wm, wsd = transform_US_to_wmsd_numba(point, weights)

        if std_type == "==":
            if abs(wm - target_mean) < epsilon and abs(wsd - target_std) < epsilon:
                filtered_points.append(point)
        elif std_type == "<=":
            if abs(wm - target_mean) < epsilon and wsd <= target_std:
                filtered_points.append(point)
        else:  # std_type == ">="
            if abs(wm - target_mean) < epsilon and wsd >= target_std:
                filtered_points.append(point)

    print(f"found {len(filtered_points)} points")
    if verbose:
        print(f"inverse_transform_numba: Returning {len(filtered_points)} solutions")

    return filtered_points
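
For reference, a minimal standalone invocation of the new helper could look as follows (an illustrative sketch: the weight vector and parameter values are hypothetical, and the first call additionally pays Numba's one-time JIT compilation cost):

import numpy as np
from utils.numba_inverse_transform import inverse_transform

weights = np.array([0.4, 0.3, 0.3])  # hypothetical criteria weights
points = inverse_transform(0.5, 0.1, weights, std_type='==', sampling_density=100, epsilon=0.01, verbose=True)
print(len(points))  # number of grid points whose WM and WSD fall within epsilon of the targets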
