Skip to content

Commit

Permalink
Merge pull request #247 from usc-isi-i2/jun_test_v2019
Browse files Browse the repository at this point in the history
fix clustering problem and Splitter temporary fix
  • Loading branch information
kyao authored Mar 4, 2019
2 parents 5d63dfa + 68c7e39 commit 42e8b5f
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 19 deletions.
6 changes: 6 additions & 0 deletions python/dsbox/controller/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -821,6 +821,11 @@ def remove_empty_targets(self, dataset: Dataset) -> Dataset:
will automatically remove empty targets in training
"""
problem = self.config.problem_metadata.query(())

# do not remove columns for cluster dataset!
if problem['about']['taskType'] == "clustering":
return dataset

targets = problem["inputs"]["data"][0]["targets"]
resID = targets[0]["resID"]
colIndex = targets[0]["colIndex"]
Expand Down Expand Up @@ -1212,6 +1217,7 @@ def generate_dataset_splits(self):
from dsbox.datapreprocessing.cleaner.splitter import Splitter, SplitterHyperparameter

hyper_sampler = SplitterHyperparameter.defaults()
hyper_sampler = hyper_sampler.replace({"threshold_column_length":100000,"further_reduce_threshold_column_length":100000})
sampler = Splitter(hyperparams = hyper_sampler)
sampler.set_training_data(inputs = self.all_dataset)
sampler.fit()
Expand Down
51 changes: 32 additions & 19 deletions python/dsbox/template/library.py
Original file line number Diff line number Diff line change
Expand Up @@ -2773,20 +2773,21 @@ def importance(dataset, problem_description):
class CMUClusteringTemplate(DSBoxTemplate):
def __init__(self):
DSBoxTemplate.__init__(self)
self.need_add_reference = True
self.template = {
"name": "CMU_Clustering_Template",
"taskType": TaskType.CLUSTERING.name,
"taskSubtype": "NONE",
"inputType": "table",
"output": "model_step",
"output": "output_step",
"steps": [
# {
# "name": "denormalize_step",
# "primitives": ["d3m.primitives.data_transformation.denormalize.Common"],
# "inputs": ["template_input"]
# },
{
"name": "to_dataframe_step",
"name": "to_dataframe_step", # step 0
"primitives": ["d3m.primitives.data_transformation.dataset_to_dataframe.Common"],
"inputs": ["template_input"]
},
Expand All @@ -2796,29 +2797,28 @@ def __init__(self):
# "inputs": ["to_dataframe_step"]
# },
{
"name": "column_parser_step",
"primitives": ["d3m.primitives.data_transformation.ToNumeric.DSBOX"],
"name": "column_parser_step",# step 1
"primitives": ["d3m.primitives.data_transformation.column_parser.DataFrameCommon"],
"inputs":["to_dataframe_step"],
},

{
"name": "extract_attribute_step",
"primitives": [{
"primitive": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.DataFrameCommon",
"hyperparameters":
{
'semantic_types': (
'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
'https://metadata.datadrivendiscovery.org/types/Attribute',),
'use_columns': (),
'exclude_columns': ()
}
}],
"name": "extract_attribute_step", # step 2
"primitives": ["d3m.primitives.data_transformation.extract_columns_by_semantic_types.DataFrameCommon",
# "hyperparameters":
# {
# 'semantic_types': (
# 'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
# 'https://metadata.datadrivendiscovery.org/types/Attribute',),
# 'use_columns': (),
# 'exclude_columns': ()
# }
],
"inputs": ["column_parser_step"]
},
{
"name":"data_clean_step",
"primitives"["d3m.primitives.data_cleaning.imputer.SKlearn"],
"name":"data_clean_step", # step 3
"primitives":["d3m.primitives.data_cleaning.imputer.SKlearn"],
"inputs":["extract_attribute_step"]
},

Expand All @@ -2829,11 +2829,24 @@ def __init__(self):
{
"primitive": "d3m.primitives.cmu.fastlvm.GMM",
"hyperparameters": {
"k": [(4), (6), (8), (10), (12)]
"k": [(1), (4), (6), (8), (10), (12)]
}
}
],
"inputs": ["data_clean_step"]
},
{
"name": "output_step",
"primitives": [
{
"primitive": "d3m.primitives.data_transformation.construct_predictions.DataFrameCommon",
"reference": {
"type": "CONTAINER",
"data": "steps.0.produce"
}
}
],
"inputs": ["model_step"]
}
]
}
Expand Down
10 changes: 10 additions & 0 deletions python/dsbox/template/template.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ def __init__(self):
"<class 'd3m.container.pandas.DataFrame'>"): "d3m.primitives.data.NDArrayToDataFrame"
}
self.description_info = ""
self.need_add_reference = False

# Need to be set by subclass inheriting DSBoxTemplate
# self.template = ""

Expand Down Expand Up @@ -195,11 +197,15 @@ def add_intermediate_type_casting(
"'s inputs does not match",
binding[in_arg][-1]["primitive"],
"and there is no converter found")

# temporary fix for CMU clustering template (with special input called "reference")

mystep = {
"primitive": binding[name]["primitive"],
"hyperparameters": binding[name]["hyperparameters"],
"inputs": fill_in
}

if "runtime" in step:
mystep["runtime"] = step["runtime"]

Expand Down Expand Up @@ -291,6 +297,10 @@ def _to_pipeline(self, binding, sequence) -> Pipeline:
# argument_type should be fixed type not the type of the data!!
name=hyperName, argument_type=self.argmentsmapper["value"],
data=hyper[hyperName])

if self.need_add_reference and primitive_name == 'd3m.primitives.data_transformation.construct_predictions.DataFrameCommon':
primitive_step.add_argument("reference",metadata_base.ArgumentType.CONTAINER,"steps.0.produce")

templateIO = binding[step]["inputs"]

# first we need to extract the types of the primitive's input and
Expand Down

0 comments on commit 42e8b5f

Please sign in to comment.