
update DatasetModelsFitnessScaler to support different dataset types
MorrisNein committed Dec 13, 2023
1 parent ca4c067 commit ab225b0
Showing 3 changed files with 20 additions and 17 deletions.
7 changes: 4 additions & 3 deletions meta_automl/data_preparation/dataset/custom_dataset.py
```diff
@@ -4,7 +4,8 @@
 from pathlib import Path
 from typing import Optional
 
-from meta_automl.data_preparation.dataset import DatasetBase, DatasetData
+from meta_automl.data_preparation.dataset import DatasetBase
+from meta_automl.data_preparation.dataset.dataset_base import DatasetDataType_co
 
 
 class DataNotFoundError(FileNotFoundError):
@@ -13,15 +14,15 @@ class DataNotFoundError(FileNotFoundError):
 
 class CustomDataset(DatasetBase):
 
-    def get_data(self, cache_path: Optional[Path] = None) -> DatasetData:
+    def get_data(self, cache_path: Optional[Path] = None) -> DatasetDataType_co:
         cache_path = cache_path or self.cache_path
         if not cache_path.exists():
             raise DataNotFoundError(f'Dataset {self} is missing by the path "{cache_path}".')
         with open(cache_path, 'rb') as f:
             dataset_data = pickle.load(f)
         return dataset_data
 
-    def dump_data(self, dataset_data: DatasetData, cache_path: Optional[Path] = None) -> CustomDataset:
+    def dump_data(self, dataset_data: DatasetDataType_co, cache_path: Optional[Path] = None) -> CustomDataset:
         cache_path = cache_path or self.cache_path
         with open(cache_path, 'wb') as f:
             pickle.dump(dataset_data, f)
```
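The covariant type variables imported here are declared elsewhere and are not part of this diff. For orientation, a minimal sketch of what `dataset_base.py` presumably provides; the exact bounds and the generic base are assumptions, not the repository's actual code:

```python
# Hypothetical sketch of the type variables imported above -- not shown in
# this commit. Covariance lets a narrower dataset (or dataset-data) type be
# accepted wherever the base type is expected.
from typing import Generic, TypeVar

DatasetDataType_co = TypeVar('DatasetDataType_co', covariant=True)


class DatasetBase(Generic[DatasetDataType_co]):
    ...


DatasetType_co = TypeVar('DatasetType_co', bound=DatasetBase, covariant=True)
```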
```diff
@@ -5,6 +5,7 @@
 from typing_extensions import Self
 
 from meta_automl.data_preparation.dataset import DatasetIDType
+from meta_automl.data_preparation.dataset.dataset_base import DatasetType_co
 from meta_automl.data_preparation.evaluated_model import EvaluatedModel
 
 
@@ -13,18 +14,20 @@ def __init__(self, scaler_class=MinMaxScaler):
         self.scaler_class = scaler_class
         self.scalers: Dict[DatasetIDType, Any] = {}
 
-    def fit(self, dataset_ids: Sequence[DatasetIDType], models: Sequence[Sequence[EvaluatedModel]]) -> Self:
-        for dataset_id, dataset_models in zip(dataset_ids, models):
+    def fit(self, models: Sequence[Sequence[EvaluatedModel]], datasets: Sequence[DatasetType_co]) -> Self:
+        dataset_representations = map(repr, datasets)
+        for dataset_repr, dataset_models in zip(dataset_representations, models):
             scaler = self.scaler_class()
-            self.scalers[dataset_id] = scaler
+            self.scalers[dataset_repr] = scaler
             fitness_values_array = [model.fitness.values for model in dataset_models]
             scaler.fit(fitness_values_array)
         return self
 
-    def transform(self, dataset_ids: Sequence[DatasetIDType], models: Sequence[Sequence[EvaluatedModel]]):
+    def transform(self, models: Sequence[Sequence[EvaluatedModel]], datasets: Sequence[DatasetType_co]):
         new_models = [[copy(model) for model in dataset_models] for dataset_models in models]
-        for dataset_id, dataset_models in zip(dataset_ids, new_models):
-            scaler = self.scalers[dataset_id]
+        dataset_representations = map(repr, datasets)
+        for dataset_repr, dataset_models in zip(dataset_representations, new_models):
+            scaler = self.scalers[dataset_repr]
             fitness_values_array = [model.fitness.values for model in dataset_models]
             fitness_values_array = scaler.transform(fitness_values_array)
             for model, fitness_values in zip(dataset_models, fitness_values_array):
@@ -34,7 +37,7 @@ def transform(self, dataset_ids: Sequence[DatasetIDType], models: Sequence[Sequence[EvaluatedModel]]):
         return new_models
 
     def fit_transform(self,
-                      dataset_ids: Sequence[DatasetIDType],
-                      models: Sequence[Sequence[EvaluatedModel]]) -> Sequence[Sequence[EvaluatedModel]]:
-        self.fit(dataset_ids, models)
-        return self.transform(dataset_ids, models)
+                      models: Sequence[Sequence[EvaluatedModel]],
+                      datasets: Sequence[DatasetType_co]) -> Sequence[Sequence[EvaluatedModel]]:
+        self.fit(models, datasets)
+        return self.transform(models, datasets)
```
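For context, here is how the reworked interface might be called. The argument order and the `repr`-based keying come straight from the diff above; `models` and `datasets` are placeholders standing in for values built as in the test below:

```python
# Hypothetical call site for the new signature. Models are grouped per
# dataset; the scaler now keys its per-dataset scalers by repr(dataset)
# rather than by an explicit dataset id, so any dataset type works as long
# as its repr() is stable and unique.
scaler = DatasetModelsFitnessScaler()          # defaults to MinMaxScaler
scaler.fit(models, datasets)
scaled_models = scaler.transform(models, datasets)

# Equivalent one-step form:
scaled_models = DatasetModelsFitnessScaler().fit_transform(models, datasets)
```

Keying by `repr` trades the old explicit-id contract for duck typing: callers no longer need to extract ids, but two distinct datasets with identical representations would silently share a scaler.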
7 changes: 3 additions & 4 deletions test/unit/test_model_fitness_scalers.py
```diff
@@ -12,15 +12,14 @@ def test_dataset_models_scaler():
     fitness_metric_name = ['a', 'b', 'c']
     predictor = Pipeline(PipelineNode('rf'))
     datasets = [CustomDataset(i) for i in range(3)]
-    dataset_ids = [dataset.id for dataset in datasets]
     n_models = 5
     models = [[EvaluatedModel(
         predictor=predictor, dataset=dataset,
         fitness=SingleObjFitness(dataset.id + i + 1, dataset.id + i + 2, dataset.id + i + 3),
         fitness_metric_name=fitness_metric_name) for i in range(n_models)] for dataset in datasets]
-    scaler = DatasetModelsFitnessScaler().fit(dataset_ids, models)
-    new_models_1 = scaler.transform(dataset_ids, models)
-    new_models_2 = scaler.fit_transform(dataset_ids, models)
+    scaler = DatasetModelsFitnessScaler().fit(models, datasets)
+    new_models_1 = scaler.transform(models, datasets)
+    new_models_2 = scaler.fit_transform(models, datasets)
 
     assert np.array(new_models_1).shape == np.array(new_models_2).shape
```
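As a sanity check on the synthetic fitness values the test builds, a small sketch of what the default scaler does to one dataset's models, assuming `fitness.values` exposes the three numbers passed to `SingleObjFitness` and using scikit-learn's per-column `MinMaxScaler`:

```python
from sklearn.preprocessing import MinMaxScaler

# For the dataset with id == 0, the test's fitness rows are (i+1, i+2, i+3)
# for i = 0..4, i.e. columns spanning [1..5], [2..6] and [3..7].
rows = [(i + 1, i + 2, i + 3) for i in range(5)]
print(MinMaxScaler().fit_transform(rows))
# Every column maps onto [0, 0.25, 0.5, 0.75, 1]: scaling is applied
# per dataset and per objective.
```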
