Skip to content

Commit

Permalink
value modifier for translate column
Browse files Browse the repository at this point in the history
related issue: #94
  • Loading branch information
semio committed Aug 1, 2019
1 parent 9f77c70 commit d3366f4
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 8 deletions.
19 changes: 14 additions & 5 deletions ddf_utils/chef/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
from . import ops
from ..model.package import DDFcsv

from ddf_utils.str import to_concept_id


memory = Memory(location=mkdtemp(), verbose=0)

Expand Down Expand Up @@ -51,8 +53,15 @@ def dsk_to_pandas(data):
return data


def build_dictionary(chef, dict_def, ignore_case=False):
def build_dictionary(chef, dict_def, ignore_case=False, value_modifier=None):
"""build a dictionary from a dictionary definition"""

def modify(d):
    """Apply the enclosing ``value_modifier`` to every value of ``d``.

    Returns ``d`` itself, untouched, when no modifier was supplied;
    otherwise returns a new dict with the same keys and modified values.
    """
    if not value_modifier:
        return d
    return {key: value_modifier(val) for key, val in d.items()}

if (len(dict_def) == 3 and
'base' in dict_def and
'key' in dict_def and
Expand All @@ -72,18 +81,18 @@ def build_dictionary(chef, dict_def, ignore_case=False):
res[k.lower()] = v
else:
res = di.copy()
return res
return modify(res)
elif ingredient.dtype == 'entities':
df = ingredient.get_data()[ingredient.key]
return build_dictionary_from_dataframe(df, keys, value, ignore_case)
return modify(build_dictionary_from_dataframe(df, keys, value, ignore_case))
else:
raise NotImplementedError('unsupported data type {}'.format(ingredient.dtype))
elif isinstance(dict_def, str):
base_path = chef.config['dictionaries_dir']
path = os.path.join(base_path, dict_def)
return build_dictionary_from_file(path)
return modify(build_dictionary_from_file(path))
else:
return dict_def
return modify(dict_def)


def build_dictionary_from_file(file_path):
Expand Down
21 changes: 18 additions & 3 deletions ddf_utils/chef/procedure/translate_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,19 @@
import logging
from typing import List

from .. helpers import debuggable, build_dictionary
from .. helpers import debuggable, build_dictionary, read_opt
from .. model.ingredient import Ingredient, get_ingredient_class
from .. model.chef import Chef

from ddf_utils.str import to_concept_id

logger = logging.getLogger('translate_column')


@debuggable
def translate_column(chef: Chef, ingredients: List[Ingredient], result, dictionary,
column, *, target_column=None, not_found='drop',
ambiguity='prompt', ignore_case=False) -> Ingredient:
ambiguity='prompt', ignore_case=False, value_modifier=None) -> Ingredient:
"""Translate column values.
Procedure format:
Expand Down Expand Up @@ -68,6 +70,8 @@ def translate_column(chef: Chef, ingredients: List[Ingredient], result, dictiona
the behavior when there are values not found in the mapping dictionary, default is 'drop'
ambiguity : {'prompt', 'skip', 'error'}, optional
the behavior when there is ambiguity in the dictionary, default is 'prompt'
value_modifier : `str`, optional
the name of a function used to modify the new column values; for now only `to_concept_id` is accepted, default is None
See Also
--------
Expand All @@ -86,11 +90,22 @@ def translate_column(chef: Chef, ingredients: List[Ingredient], result, dictiona
di = ingredient.get_data()
new_data = dict()

# modifier
value_modifier = read_opt(dictionary, 'value_modifier', default=None, method='pop')
if value_modifier == 'to_concept_id':
modifier = to_concept_id
else:
# TODO: accept more modifiers
logger.warning("for now only `to_concept_id` is accepted")
modifier = None

# build the dictionary
dictionary_ = build_dictionary(chef, dictionary, ignore_case)
dictionary_ = build_dictionary(chef, dictionary, ignore_case, value_modifier=modifier)
dict_type = 'inline'
base_df = None

# modifier

for k, df in di.items():
logger.debug("running on: " + k)
if df.dtypes[column].name == "category":
Expand Down
1 change: 1 addition & 0 deletions tests/chef/recipes/test_translate_column.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ cooking:
'arb1', 'arb2', 'arb3', 'arb4',
'arb5', 'arb6', 'name']
value: country
value_modifier: to_concept_id
not_found: drop
result: geo-aligned
- procedure: translate_column
Expand Down

0 comments on commit d3366f4

Please sign in to comment.