From 86c80487d7bee90d2125bc5cd243f1c1ba87ad8b Mon Sep 17 00:00:00 2001 From: Vasiliy Alekseev Date: Mon, 8 Jun 2020 22:38:10 +0300 Subject: [PATCH 1/9] increase py version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 03087dc..adc893d 100644 --- a/setup.py +++ b/setup.py @@ -52,6 +52,6 @@ 'Topic :: Scientific/Engineering :: Artificial Intelligence', 'Topic :: Scientific/Engineering :: Information Analysis', 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', ], ) From 6e2315a7df51609e745f0dd3354424ab9209e524 Mon Sep 17 00:00:00 2001 From: Vasiliy Alekseev Date: Tue, 7 Jul 2020 00:32:04 +0300 Subject: [PATCH 2/9] add new stuff for the release --- .gitignore | 2 + docs/cooking_machine/config_parser.html | 394 +++-- docs/cooking_machine/cubes/base_cube.html | 68 +- .../cubes/controller_cube.html | 1091 ++++++++---- docs/cooking_machine/cubes/cube_creator.html | 34 +- .../cubes/greedy_strategy.html | 38 +- docs/cooking_machine/cubes/index.html | 65 +- .../cubes/perplexity_strategy.html | 46 +- .../cubes/regularizer_cube.html | 105 +- docs/cooking_machine/cubes/strategy.html | 46 +- docs/cooking_machine/dataset.html | 187 +- docs/cooking_machine/dataset_cooc.html | 34 +- docs/cooking_machine/experiment.html | 274 ++- docs/cooking_machine/index.html | 69 +- docs/cooking_machine/model_constructor.html | 46 +- docs/cooking_machine/model_tracking.html | 88 +- docs/cooking_machine/models/base_model.html | 140 +- .../models/base_regularizer.html | 34 +- docs/cooking_machine/models/base_score.html | 478 +++++- .../models/blei_lafferty_score.html | 45 +- .../models/dummy_topic_model.html | 52 +- .../cooking_machine/models/example_score.html | 57 +- docs/cooking_machine/models/frozen_score.html | 77 +- docs/cooking_machine/models/index.html | 88 +- .../models/intratext_coherence_score.html | 137 +- docs/cooking_machine/models/scores.html | 135 +- 
.../models/scores_wrapper.html | 46 +- .../models/semantic_radius_score.html | 78 +- .../models/thetaless_regularizer.html | 565 +++++- docs/cooking_machine/models/topic_model.html | 506 ++++-- .../models/topic_prior_regularizer.html | 40 +- docs/cooking_machine/pretty_output.html | 84 +- .../recipes/artm_baseline_pipeline.html | 47 +- .../recipes/exploratory_search_pipeline.html | 24 +- docs/cooking_machine/recipes/index.html | 36 +- .../recipes/intratext_coherence_pipeline.html | 62 +- ...ultimodal_exploratory_search_pipeline.html | 285 +++- .../recipes/recipe_wrapper.html | 169 +- docs/cooking_machine/recipes/wntm.html | 82 +- docs/cooking_machine/rel_toolbox_lite.html | 82 +- docs/cooking_machine/routine.html | 233 ++- docs/dataset_manager/api.html | 76 +- docs/dataset_manager/index.html | 14 +- docs/index.html | 33 +- docs/viewers/base_viewer.html | 40 +- docs/viewers/document_cluster.html | 40 +- docs/viewers/index.html | 106 +- docs/viewers/initial_doc_to_topic_viewer.html | 28 +- docs/viewers/spectrum.html | 100 +- docs/viewers/top_documents_viewer.html | 82 +- .../viewers/top_similar_documents_viewer.html | 54 +- docs/viewers/top_tokens_viewer.html | 159 +- docs/viewers/topic_flow_viewer.html | 52 +- docs/viewers/topic_mapping.html | 54 +- topicnet/README-rus.md | 23 - topicnet/bitbucket-pipelines.yml | 2 +- topicnet/cooking_machine/config_parser.py | 173 +- .../cooking_machine/cubes/controller_cube.py | 231 ++- .../cubes/perplexity_strategy.py | 2 +- .../cooking_machine/cubes/regularizer_cube.py | 25 +- topicnet/cooking_machine/dataset.py | 19 +- topicnet/cooking_machine/experiment.py | 28 +- topicnet/cooking_machine/models/__init__.py | 1 + topicnet/cooking_machine/models/base_score.py | 132 +- .../models/blei_lafferty_score.py | 13 +- .../models/dummy_topic_model.py | 3 + .../cooking_machine/models/example_score.py | 13 +- .../models/intratext_coherence_score.py | 6 +- .../cooking_machine/models/scores_wrapper.py | 6 +- 
.../models/thetaless_regularizer.py | 230 ++- .../cooking_machine/models/topic_model.py | 117 +- .../recipes/artm_baseline_pipeline.py | 16 + .../recipes/exploratory_search_pipeline.py | 4 + .../intratext_coherence_maximization.yml | 1 + .../recipes/intratext_coherence_pipeline.py | 12 + .../multimodal_exploratory_search_pipeline.py | 90 +- .../cooking_machine/recipes/recipe_wrapper.py | 52 +- topicnet/cooking_machine/routine.py | 32 +- topicnet/dataset_manager/api.py | 18 +- topicnet/demos/README.md | 1 + .../demos/Topic-Thetaless-Regularizer.ipynb | 1290 ++++++++++++++ .../Visualizing-Your-Model-Documents.ipynb | 16 +- .../demos/{ => images}/topic_clusters.html | 0 .../demos/topic_thetaless_regularizer.ipynb | 1512 ----------------- topicnet/tests/test_cube_controller.py | 4 +- topicnet/tests/test_cube_utils.py | 5 +- topicnet/tests/test_cubes.py | 155 +- topicnet/tests/test_dataset.py | 1 - topicnet/tests/test_experiment.py | 6 +- topicnet/tests/test_experiment_restore.py | 295 ++++ topicnet/tests/test_experiment_select.py | 5 +- topicnet/tests/test_pipeline.py | 43 +- topicnet/tests/test_topic_model.py | 146 ++ 93 files changed, 7704 insertions(+), 4031 deletions(-) delete mode 100644 topicnet/README-rus.md create mode 100644 topicnet/demos/Topic-Thetaless-Regularizer.ipynb rename topicnet/demos/{ => images}/topic_clusters.html (100%) delete mode 100644 topicnet/demos/topic_thetaless_regularizer.ipynb create mode 100644 topicnet/tests/test_experiment_restore.py diff --git a/.gitignore b/.gitignore index c77c71b..a17a4bf 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ build/ dist/ +topicnet/dataset_manager/*__internals +topicnet/tests/test_data/*__internals diff --git a/docs/cooking_machine/config_parser.html b/docs/cooking_machine/config_parser.html index 85b3f16..84df31b 100644 --- a/docs/cooking_machine/config_parser.html +++ b/docs/cooking_machine/config_parser.html @@ -3,15 +3,15 @@ - + topicnet.cooking_machine.config_parser API documentation - - + 
+ @@ -43,7 +43,7 @@

Module topicnet.cooking_machine.config_parser

Our process consists of three stages: -1) we check the high-level structure using base_schema. +1) we check the high-level structure using BASE_SCHEMA. The presence of each required key is ensured. After this stage we could be sure than we can create a valid model using specified parameters.

@@ -59,7 +59,9 @@

Module topicnet.cooking_machine.config_parser

-Source code + +Expand source code +
"""
 Parsing text file into Experiment instance using strictyaml
 (github.com/crdoconnor/strictyaml/)
@@ -88,7 +90,7 @@ 

Module topicnet.cooking_machine.config_parser

Module topicnet.cooking_machine.config_parserModule topicnet.cooking_machine.config_parserModule topicnet.cooking_machine.config_parserModule topicnet.cooking_machine.config_parserModule topicnet.cooking_machine.config_parserModule topicnet.cooking_machine.config_parserModule topicnet.cooking_machine.config_parserModule topicnet.cooking_machine.config_parserModule topicnet.cooking_machine.config_parser
@@ -655,20 +730,22 @@

Functions

def build_cube_settings(elemtype, elem_args)
-

Parameters

+

Parameters

elemtype : str
name of regularizer
-
elem_args : strictyaml.YAML object
+
elem_args : strictyaml.YAML object
(contains dict inside)

Returns

list of dict
 
-
+
-Source code + +Expand source code +
def build_cube_settings(elemtype, elem_args):
     """
     Parameters
@@ -698,7 +775,7 @@ 

Returns

def build_experiment_environment_from_yaml_config(yaml_string, experiment_id, save_path, force_separate_thread=False)
-

Wraps up parameter extraction and class instances creation +

Wraps up parameter extraction and class instances creation from yaml formatted string together with the method that builds experiment pipeline from given experiment parameters (model, cubes, regularizers, etc)

@@ -711,20 +788,26 @@

Parameters

path to the folder to save experiment logs and models
experiment_id : str
name of the experiment folder
-
force_separate_thread : bool default = False
+
force_separate_thread : bool default = False
experimental feature that packs model training into separate process which is killed upon training completion by default is not used

Returns

-
tuple experiment, dataset instances of corresponding classes from topicnet
+
tuple experiment, dataset instances of corresponding classes from topicnet
 
-
+
-Source code -
def build_experiment_environment_from_yaml_config(yaml_string, experiment_id,
-                                                  save_path, force_separate_thread=False):
+
+Expand source code
+
+
def build_experiment_environment_from_yaml_config(
+    yaml_string,
+    experiment_id,
+    save_path,
+    force_separate_thread=False,
+):
     """
     Wraps up parameter extraction and class instances creation
     from yaml formatted string
@@ -747,13 +830,14 @@ 

Returns

Returns ------- - tuple experiment, dataset instances of corresponding classes from topicnet + """ settings, regs, model, dataset = parse(yaml_string, force_separate_thread) # TODO: handle dynamic addition of regularizers experiment = Experiment(experiment_id=experiment_id, save_path=save_path, topic_model=model) experiment.build(settings) + return experiment, dataset
@@ -761,22 +845,24 @@

Returns

def build_regularizer(elemtype, elem_args, specific_topic_names, background_topic_names)
-

Parameters

+

Parameters

elemtype : str
name of regularizer
elem_args : dict
 
-
parsed : strictyaml.YAML object
+
parsed : strictyaml.YAML object
 

Returns

instance of artm.Regularizer
 
-
+
-Source code + +Expand source code +
def build_regularizer(elemtype, elem_args, specific_topic_names, background_topic_names):
     """
     Parameters
@@ -807,15 +893,17 @@ 

Returns

def build_schema_for_cubes()
-

Returns

+

Returns

dict
each element is str -> strictyaml.Map where key is name of cube, value is a schema used for validation and type-coercion
-
+
-Source code + +Expand source code +
def build_schema_for_cubes():
     """
     Returns
@@ -863,13 +951,15 @@ 

Returns

def build_schema_for_regs()
-

Returns

+

Returns

strictyaml.Map
schema used for validation and type-coercion
-
+
-Source code + +Expand source code +
def build_schema_for_regs():
     """
     Returns
@@ -897,13 +987,15 @@ 

Returns

def build_schema_for_scores()
-

Returns

+

Returns

strictyaml.Map
schema used for validation and type-coercion
-
+
-Source code + +Expand source code +
def build_schema_for_scores():
     """
     Returns
@@ -935,11 +1027,34 @@ 

Returns

return schemas
+
+def build_schema_from_function(func: Callable) -> dict +
+
+
+
+ +Expand source code + +
def build_schema_from_function(func: Callable) -> dict:
+    from docstring_parser import parse as docstring_parse
+
+    func_params = signature(func).parameters
+    func_params_schema = dict()
+
+    for elem in docstring_parse(func.__doc__).params:
+        if elem.arg_name in func_params:
+            key = choose_key(func_params[elem.arg_name])
+            func_params_schema[key] = TYPE_VALIDATORS[elem.type_name]
+
+    return func_params_schema
+
+
def build_schema_from_signature(class_of_object, use_optional=True)
-

Parameters

+

Parameters

class_of_object : class
 
@@ -948,9 +1063,11 @@

Returns

dict
each element is either str -> Validator or Optional(str) -> Validator
-
+
-Source code + +Expand source code +
def build_schema_from_signature(class_of_object, use_optional=True):
     """
     Parameters
@@ -972,7 +1089,7 @@ 

Returns

def build_score(elemtype, elem_args, is_artm_score)
-

Parameters

+

Parameters

elemtype : str
name of score
@@ -983,11 +1100,13 @@

Returns

Returns

-
instance of artm.scores.BaseScore or topicnet.cooking_machine.models.base_score
+
instance of artm.scores.BaseScore or topicnet.cooking_machine.models.base_score
 
-
+
-Source code + +Expand source code +
def build_score(elemtype, elem_args, is_artm_score):
     """
     Parameters
@@ -1013,7 +1132,7 @@ 

Returns

def choose_key(param)
-

Parameters

+

Parameters

param : inspect.Parameter
 
@@ -1022,9 +1141,11 @@

Returns

str or strictyaml.Optional
 
-
+
-Source code + +Expand source code +
def choose_key(param):
     """
     Parameters
@@ -1037,6 +1158,7 @@ 

Returns

""" if param.default is not Parameter.empty: return Optional(param.name) + return param.name
@@ -1044,7 +1166,7 @@

Returns

def choose_validator(param)
-

Parameters

+

Parameters

param : inspect.Parameter
 
@@ -1053,9 +1175,11 @@

Returns

instance of strictyaml.Validator
 
-
+
-Source code + +Expand source code +
def choose_validator(param):
     """
     Parameters
@@ -1076,6 +1200,7 @@ 

Returns

return Str() if param.name in ARTM_TYPES: return ARTM_TYPES[param.name] + return Any()
@@ -1083,7 +1208,7 @@

Returns

def handle_special_cases(elem_args, kwargs)
-

In-place fixes kwargs, handling special cases and shortcuts +

In-place fixes kwargs, handling special cases and shortcuts (only strategy for now) Parameters


@@ -1092,9 +1217,11 @@

Returns

 
kwargs : dict
 
-
+
-Source code + +Expand source code +
def handle_special_cases(elem_args, kwargs):
     """
     In-place fixes kwargs, handling special cases and shortcuts
@@ -1126,9 +1253,11 @@ 

Returns

def is_key_in_schema(key, schema)
-
+
-Source code + +Expand source code +
def is_key_in_schema(key, schema):
     if key in schema:
         return True
@@ -1139,10 +1268,10 @@ 

Returns

-def parse(yaml_string, force_separate_thread=False, dataset_class=) +def parse(yaml_string: str, force_separate_thread: bool = False, dataset_class: Type[Dataset] = topicnet.cooking_machine.dataset.Dataset)
-

Parameters

+

Parameters

yaml_string : str
 
@@ -1161,9 +1290,11 @@

Returns

 
dataset : Dataset
 
-
+
-Source code + +Expand source code +
def parse(
     yaml_string: str,
     force_separate_thread: bool = False,
@@ -1182,8 +1313,9 @@ 

Returns

regularizers: list topic_model: TopicModel dataset: Dataset + """ - parsed = dirty_load(yaml_string, base_schema, allow_flow_style=True) + parsed = dirty_load(yaml_string, BASE_SCHEMA, allow_flow_style=True) specific_topic_names, background_topic_names = create_default_topics( parsed.data["topics"]["specific_topics"], @@ -1192,12 +1324,22 @@

Returns

revalidate_section(parsed, "stages") revalidate_section(parsed, "regularizers") + if "scores" in parsed: revalidate_section(parsed, "scores") - cube_settings = [] + dataset = dataset_class( + data_path=parsed.data["model"]["dataset_path"], + keep_in_memory=parsed.data["model"].get("keep_in_memory", True), + internals_folder_path=parsed.data["model"].get("internals_folder_path", None), + ) + filter_parameters = parsed.data["model"].get( + KEY_DICTIONARY_FILTER_PARAMETERS, dict() + ) - dataset = dataset_class(parsed.data["model"]["dataset_path"]) + if len(filter_parameters) > 0: + filtered_dictionary = dataset.get_dictionary().filter(**filter_parameters) + dataset._cached_dict = filtered_dictionary modalities_to_use = parse_modalities_data(parsed) @@ -1216,11 +1358,12 @@

Returns

topic_model = TopicModel(model) _add_parsed_scores(parsed, topic_model) + cube_settings = list() + for stage in parsed['stages']: for elemtype, elem_args in stage.items(): settings = build_cube_settings(elemtype.data, elem_args) - if force_separate_thread: - settings[elemtype]["separate_thread"] = False + settings[elemtype]["separate_thread"] = force_separate_thread cube_settings.append(settings) return cube_settings, regularizers, topic_model, dataset
@@ -1230,9 +1373,11 @@

Returns

def parse_modalities_data(parsed)
-
+
-Source code + +Expand source code +
def parse_modalities_data(parsed):
     has_modalities_to_use = is_key_in_schema("modalities_to_use", parsed["model"])
     has_weights = is_key_in_schema("modalities_weights", parsed["model"])
@@ -1240,7 +1385,7 @@ 

Returns

# exactly one should be specified if has_modalities_to_use == has_weights: - raise ValueError(f"Either 'modalities_to_use' or 'modalities_weights' should be specified") + raise ValueError("Either 'modalities_to_use' or 'modalities_weights' should be specified") if has_weights: modalities_to_use = list(parsed["model"]["modalities_weights"].data) @@ -1261,7 +1406,7 @@

Returns

def preprocess_parameters_for_cube_creator(elem_args)
-

This function does two things: +

This function does two things: 1) convert class_ids from name: class_ids@text, values: [0, 1, 2, 3] to @@ -1269,16 +1414,18 @@

Returns

2) type conversion for "values" field.

Parameters

-
elem_args : strictyaml.YAML object
+
elem_args : strictyaml.YAML object
(contains dict inside)

Returns

new_elem_args : dict
 
-
+
-Source code + +Expand source code +
def preprocess_parameters_for_cube_creator(elem_args):
     """
     This function does two things:
@@ -1315,16 +1462,18 @@ 

Returns

def revalidate_section(parsed, section)
-

Perofrms in-place type coercion and validation

+

Perofrms in-place type coercion and validation

Parameters

-
parsed : strictyaml.YAML object
+
parsed : strictyaml.YAML object
(half-parsed, half-validated chunk of config)
section : str
 
-
+
-Source code + +Expand source code +
def revalidate_section(parsed, section):
     """
     Perofrms in-place type coercion and validation
@@ -1359,9 +1508,11 @@ 

Parameters

def wrap_in_map(dictionary)
-
+
-Source code + +Expand source code +
def wrap_in_map(dictionary):
     could_be_empty = all(isinstance(key, Optional) for key in dictionary)
     if could_be_empty:
@@ -1393,6 +1544,7 @@ 

Index

  • build_schema_for_cubes
  • build_schema_for_regs
  • build_schema_for_scores
  • +
  • build_schema_from_function
  • build_schema_from_signature
  • build_score
  • choose_key
  • @@ -1410,7 +1562,7 @@

    Index

    diff --git a/docs/cooking_machine/cubes/base_cube.html b/docs/cooking_machine/cubes/base_cube.html index 08fc64c..9c2787a 100644 --- a/docs/cooking_machine/cubes/base_cube.html +++ b/docs/cooking_machine/cubes/base_cube.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.cubes.base_cube API documentation - - + + @@ -21,7 +21,9 @@

    Module topicnet.cooking_machine.cubes.base_cube
    -Source code + +Expand source code +
    import os
     from tqdm import tqdm
     import warnings
    @@ -384,7 +386,7 @@ 

    Functions

    def check_experiment_existence(topic_model)
    -

    Checks if topic_model has experiment.

    +

    Checks if topic_model has experiment.

    Parameters

    topic_model : TopicModel
    @@ -394,9 +396,11 @@

    Returns

    bool
    True if experiment exists, in other case False.
    -
    +
    -Source code + +Expand source code +
    def check_experiment_existence(topic_model):
         """
         Checks if topic_model has experiment.
    @@ -421,9 +425,11 @@ 

    Returns

    def get_from_queue_till_fail(queue, error_message='')
    -
    +
    -Source code + +Expand source code +
    def get_from_queue_till_fail(queue,  error_message='',):
         return queue.get()
    @@ -432,9 +438,11 @@

    Returns

    def put_to_queue(queue, puttable)
    -
    +
    -Source code + +Expand source code +
    def put_to_queue(queue, puttable):
         queue.put(puttable)
    @@ -443,9 +451,11 @@

    Returns

    def retrieve_score_for_strategy(score_name=None)
    -
    +
    -Source code + +Expand source code +
    def retrieve_score_for_strategy(score_name=None):
         if not score_name:
             score_name = 'PerplexityScore@all'
    @@ -468,7 +478,7 @@ 

    Classes

    (num_iter, action=None, reg_search='grid', strategy=None, tracked_score_function=None, verbose=False, separate_thread=True)
    -

    Abstract class for all cubes.

    +

    Abstract class for all cubes.

    Initialize stage. Checks params and update .parameters attribute.

    Parameters

    @@ -489,9 +499,11 @@

    Parameters

    visualization flag
    separate_thread : bool
    will train models inside a separate thread if True
    -
    +
    -Source code + +Expand source code +
    class BaseCube:
         """
         Abstract class for all cubes.
    @@ -785,9 +797,9 @@ 

    Parameters

    Subclasses

    Methods

    @@ -795,12 +807,12 @@

    Methods

    def apply(self, topic_model, one_cube_parameter, dictionary=None, model_id=None)
    -

    "apply" method changes topic_model in way that is defined by one_cube_parameter.

    +

    "apply" method changes topic_model in way that is defined by one_cube_parameter.

    Parameters

    topic_model : TopicModel
    topic model
    -
    one_cube_parameter : optional
    +
    one_cube_parameter : optional
    parameters of one experiment
    dictionary : dict
    dictionary so that the it can be used @@ -808,9 +820,11 @@

    Parameters

    model_id : str
    id of created model if necessary (Default value = None)
    -

    Returns

    +

    Returns

    -Source code + +Expand source code +
    def apply(self, topic_model, one_cube_parameter, dictionary=None, model_id=None):
         """
         "apply" method changes topic_model in way that is defined by one_cube_parameter.
    @@ -838,15 +852,17 @@ 

    Returns

    def get_jsonable_from_parameters(self)
    -

    Transform self.parameters to something that can be downloaded as json.

    +

    Transform self.parameters to something that can be downloaded as json.

    Parameters

    Returns

    -
    optional
    +
    optional
    something jsonable
    -
    +
    -Source code + +Expand source code +
    def get_jsonable_from_parameters(self):
         """
         Transform self.parameters to something that can be downloaded as json.
    @@ -902,7 +918,7 @@ 

    -

    Generated by pdoc 0.6.3.

    +

    Generated by pdoc 0.8.1.

    diff --git a/docs/cooking_machine/cubes/controller_cube.html b/docs/cooking_machine/cubes/controller_cube.html index 11c70ff..4488146 100644 --- a/docs/cooking_machine/cubes/controller_cube.html +++ b/docs/cooking_machine/cubes/controller_cube.html @@ -3,15 +3,15 @@ - + topicnet.cooking_machine.cubes.controller_cube API documentation - - + + @@ -21,104 +21,125 @@

    Module topicnet.cooking_machine.cubes.controller_cube

    -

    Allows to add ControllerAgent (with unknown parameters) to the model, which enables user to +

    Allows to add ControllerAgent (with unknown parameters) to the model, which enables user to change tau during the _fit method.

    parameters is a dict with four fields:

    Fields

    -
    -
    reg_name : str
    -
    The name of regularizer. We want to change the tau coefficient of it during training -Note that only one of ("reg_name", "regularizer") should be provided
    -
    regularizer : artm.regularizer.Regularizer
    -
    Regularizer object (if we want to add non-existing regularizer to the model) -Note that only one of ("reg_name", "regularizer") should be provided
    -
    score_to_track : str
    -
    -

    The name of metric which we will track. +

    reg_name: str +The name of regularizer. We want to change the tau coefficient of it during training +Note that only one of ("reg_name", "regularizer") should be provided +regularizer: artm.regularizer.Regularizer +Regularizer object (if we want to add non-existing regularizer to the model) +Note that only one of ("reg_name", "regularizer") should be provided +score_to_track: str +The name of metric which we will track. We assume that if that metric is 'sort of decreasing', then everything is OK and we are allowed to change tau coefficient further; otherwise we revert back to the last "safe" value and stop

    -

    More formal definition of "sort of decreasing": if we divide a curve into two parts like so:

    -
    ##################################### 
    -#. . . .. . . . ..  . .. . .  ... . # 
    -#%. . .  . . . .  .. . . . . .  . ..# 
    -#:t . . . . . . . . . . . . . . .  .# 
    -# t: . . . . . . . . . . . . . . ...# 
    -#. %. . . . . . . . . . . . . . .  .# 
    -#. :t. . . . . . . . .  .  . . . . .# 
    -#.. ;; . .  . . . .  . . . .  . . ..# 
    -#  ..t..  . .  . . . . . . . . . . .# 
    -#. . :t .. . . .  . . . . . . . . ..# 
    -#. .. t: . . . . . . . . . . . . . .# 
    -#.   ..S: . . . . . . . . . . . . ..# 
    -#. . . .:;: . . . . .  . . . . . . .# 
    -#. . .  . :;;  . . . . . . . . . . .# 
    -#. . . . .. :%.      nmmMMmmn   .  .# 
    -# .   . .  . .tt%.ztttt"' '""ttttttt# 
    -#. . .    . . . '"' . . . . . . . . # 
    -##################################### 
    -|                |                  | 
    -|   left part    |                  | 
    -           global minimum           | 
    -                 |     right part   |
    +
    'sort of decreasing' performs best with <code>PerplexityScore</code>,
    +and all scores which behave like perplexity
    +(nonnegative, and which should decrease when a model gets better).
    +If you want to track a different kind of score,
    +it is recommended to use <code>score\_controller</code> parameter
    +
    +More formal definition of "sort of decreasing":
    +if we divide a curve into two parts like so:
    +
    +
    +    ##################################### 
    +    #. . . .. . . . ..  . .. . .  ... . # 
    +    #%. . .  . . . .  .. . . . . .  . ..# 
    +    #:t . . . . . . . . . . . . . . .  .# 
    +    # t: . . . . . . . . . . . . . . ...# 
    +    #. %. . . . . . . . . . . . . . .  .# 
    +    #. :t. . . . . . . . .  .  . . . . .# 
    +    #.. ;; . .  . . . .  . . . .  . . ..# 
    +    #  ..t..  . .  . . . . . . . . . . .# 
    +    #. . :t .. . . .  . . . . . . . . ..# 
    +    #. .. t: . . . . . . . . . . . . . .# 
    +    #.   ..S: . . . . . . . . . . . . ..# 
    +    #. . . .:;: . . . . .  . . . . . . .# 
    +    #. . .  . :;;  . . . . . . . . . . .# 
    +    #. . . . .. :%.      nmmMMmmn   .  .# 
    +    # .   . .  . .tt%.ztttt"' '""ttttttt# 
    +    #. . .    . . . '"' . . . . . . . . # 
    +    ##################################### 
    +    |                |                  | 
    +    |   left part    |                  | 
    +               global minimum           | 
    +                     |     right part   |
    +
    +then the right part is no higher than 5% of global minimum
    +(you can change 5% if you like by adjusting <code>fraction\_threshold</code> parameter)
    +
    +If <code>score\_to\_track</code> is None and <code>score\_controller</code> is None,
    +then <code><a title="topicnet.cooking_machine.cubes.controller_cube.ControllerAgent" href="#topicnet.cooking_machine.cubes.controller_cube.ControllerAgent">ControllerAgent</a></code> will never stop
    +(useful for e.g. decaying coefficients)
     
    -

    then the right part is no higher than 5% of global minimum -(you can change 5% if you like by adjusting fraction_threshold -in is_score_out_of_control() function)

    -

    If score_to_track is None, then ControllerAgent will never stop -(useful for e.g. decaying coefficients)

    -
    -
    tau_converter : str or callable
    -
    -

    Notably, def-style functions and lambda functions are allowed +

    fraction_threshold: float +Threshold to control a score by 'sort of decreasing' metric +score_controller: BaseScoreController +Custom score controller +In case of 'sort of decreasing' is not proper to control score, +you are able to create custom Score Controller +inherited from BaseScoreController. +tau_converter: str or callable +Notably, def-style functions and lambda functions are allowed If it is function, then it should accept four arguments: (initial_tau, prev_tau, cur_iter, user_value) For example:

    -
    >> lambda initial_tau, prev_tau, cur_iter, user_value:
    ->>     initial_tau if cur_iter % 2 == 0 else 0
    -
    -

    (Note that experiment description might display lambda functions incorrectly; -Try to keep them to a single line or use def-style functions instead)

    -
    >> def func(initial_tau, prev_tau, cur_iter, user_value):
    ->>     relu_grower = user_value * (cur_iter - 8) if cur_iter > 8 else 0
    ->>     return 0 if cur_iter % 2 else relu_grower
    +
        >> lambda initial_tau, prev_tau, cur_iter, user_value:
    +    >>     initial_tau if cur_iter % 2 == 0 else 0
    +
    +(Note that experiment description might display lambda functions incorrectly;
    + Try to keep them to a single line or use def-style functions instead)
    +
    +    >> def func(initial_tau, prev_tau, cur_iter, user_value):
    +    >>     relu_grower = user_value * (cur_iter - 8) if cur_iter > 8 else 0
    +    >>     return 0 if cur_iter % 2 else relu_grower
    +
    +If it is a string, then it should be an expression consisting of numbers, operations
    +    and variables (four are allowed: <code>initial\_tau, prev\_tau, cur\_iter, user\_value</code>)
    +For example:
    +
    +`>> "initial_tau * ((cur_iter + 1) % 2)"`
    +
    +or
    +
    +`>> "prev_tau * user_value"`
     
    -

    If it is a string, then it should be an expression consisting of numbers, operations -and variables (four are allowed: initial_tau, prev_tau, cur_iter, user_value) -For example:

    -

    >> "initial_tau * ((cur_iter + 1) % 2)"

    -

    or

    -

    >> "prev_tau * user_value"

    -
    -
    user_value_grid : list of numeric
    -
    -

    Values for user_value variable +

    user_value_grid: list of numeric +Values for user_value variable When writing tau_converter, you can use user_value variable.

    -

    For example:

    -
    >> tau_converter: "prev_tau * user_value"
    ->> user_value_grid: [1, 0.99, 0.95, 0.90, 0.80, 0.5]
    -
    -

    (I know that tau should decay exponentially, but I'm unsure of exact half-life)

    -
    >> tau_converter: "prev_tau + user_value"
    ->> user_value_grid: [50, 100, 150, 200, 250]
    -
    -

    (I know that tau should increase linearly, but I'm unsure of exact speed)

    -
    >> def func(initial_tau, prev_tau, cur_iter, user_value):
    ->>     new_tau = 50 * (cur_iter - user_value) if cur_iter > user_value else 0
    ->>     return new_tau
    ->> tau_converter: func
    ->> user_value_grid: [10, 15, 20, 25, 30]
    +
    For example:
    +
    +    >> tau_converter: "prev_tau * user_value"
    +    >> user_value_grid: [1, 0.99, 0.95, 0.90, 0.80, 0.5]
    +
    +(I know that tau should decay exponentially, but I'm unsure of exact half-life)
    +
    +    >> tau_converter: "prev_tau + user_value"
    +    >> user_value_grid: [50, 100, 150, 200, 250]
    +
    +(I know that tau should increase linearly, but I'm unsure of exact speed)
    +
    +    >> def func(initial_tau, prev_tau, cur_iter, user_value):
    +    >>     new_tau = 50 * (cur_iter - user_value) if cur_iter > user_value else 0
    +    >>     return new_tau
    +    >> tau_converter: func
    +    >> user_value_grid: [10, 15, 20, 25, 30]
    +
    +(Tau should start with zero, then increase linearly. I don't know when to start this process)
     
    -

    (Tau should start with zero, then increase linearly. I don't know when to start this process)

    -
    -
    max_iter : numeric
    -
    Optional (default value is num_iter specified for cube) +

    max_iter: numeric +Optional (default value is num_iter specified for cube) Agent will stop changing tau after max_iters iterations max_iters could be float("NaN") and float("inf") values: -that way agent will continue operating even outside this RegularizationControllerCube

    -
    +that way agent will continue operating even outside this RegularizationControllerCube

    -Source code + +Expand source code +
    """
     Allows to add `ControllerAgent` (with unknown parameters) to the model, which enables user to
     change `tau` during the `_fit` method.
    @@ -140,7 +161,14 @@ 

    Fields

    and we are allowed to change tau coefficient further; otherwise we revert back to the last "safe" value and stop - More formal definition of "sort of decreasing": if we divide a curve into two parts like so: + 'sort of decreasing' performs best with `PerplexityScore`, + and all scores which behave like perplexity + (nonnegative, and which should decrease when a model gets better). + If you want to track a different kind of score, + it is recommended to use `score_controller` parameter + + More formal definition of "sort of decreasing": + if we divide a curve into two parts like so: ##################################### @@ -167,12 +195,18 @@

    Fields

    | right part | then the right part is no higher than 5% of global minimum - (you can change 5% if you like by adjusting `fraction_threshold` - in `is_score_out_of_control` function) + (you can change 5% if you like by adjusting `fraction_threshold` parameter) - If score_to_track is None, then `ControllerAgent` will never stop + If `score_to_track` is None and `score_controller` is None, + then `ControllerAgent` will never stop (useful for e.g. decaying coefficients) - +fraction_threshold: float + Threshold to control a score by 'sort of decreasing' metric +score_controller: BaseScoreController + Custom score controller + In case of 'sort of decreasing' is not proper to control score, + you are able to create custom Score Controller + inherited from `BaseScoreController`. tau_converter: str or callable Notably, def-style functions and lambda functions are allowed If it is function, then it should accept four arguments: @@ -230,58 +264,112 @@

    Fields

    that way agent will continue operating even outside this `RegularizationControllerCube` """ # noqa: W291 -from .base_cube import BaseCube -from ..rel_toolbox_lite import count_vocab_size, handle_regularizer - -import numexpr as ne import warnings -from dill.source import getsource from copy import deepcopy +from dataclasses import dataclass +from numbers import Number +from typing import ( + Callable, + List, + Optional, + Union, +) + +import numexpr as ne import numpy as np +from dill.source import getsource +from .base_cube import BaseCube +from ..models.base_regularizer import BaseRegularizer +from ..rel_toolbox_lite import count_vocab_size, handle_regularizer W_HALT_CONTROL = "Process of dynamically changing tau was stopped at {} iteration" W_MAX_ITERS = "Maximum number of iterations is exceeded; turning off" -def is_score_out_of_control(model, score_name, fraction_threshold=0.05): - """ - Returns True if score isn't 'sort of decreasing' anymore. +@dataclass +class OutOfControlAnswer: + answer: bool + error_message: Optional[str] = None - See docstring for RegularizationControllerCube for details - Parameters - ---------- - model : TopicModel - score_name : str or None - fraction_threshold : float +class BaseScoreController: + def __init__(self, score_name): + self.score_name = score_name - Returns - ------- - bool + def get_score_values(self, model): + if self.score_name not in model.scores: # case of None is handled here as well + return None + + vals = model.scores[self.score_name] + + if len(vals) == 0: + return None + + return vals + def __call__(self, model): + values = self.get_score_values(model) + + if values is None: + return False + + try: + out_of_control_result = self.is_out_of_control(values) + except Exception as ex: + raise ValueError( + f"An error occurred while controlling {self.score_name}!" + f" Message: {ex}. 
Score values: {values}" + ) + + if out_of_control_result.error_message is not None: + warnings.warn(out_of_control_result.error_message) + + return out_of_control_result.answer + + def is_out_of_control(self, values: List[float]) -> OutOfControlAnswer: + raise NotImplementedError + + +class PerplexityScoreController(BaseScoreController): + """ + Controller is proper to control the Perplexity score. + For others, please ensure for yourself. """ + DEFAULT_FRACTION_THRESHOLD = 0.05 - if score_name not in model.scores: # case of None is handled here as well - return False + def __init__(self, score_name, fraction_threshold=DEFAULT_FRACTION_THRESHOLD): + super().__init__(score_name) + self.fraction_threshold = fraction_threshold - vals = model.scores[score_name] - if len(vals) == 0: - return False + def is_out_of_control(self, values: List[float]): + idxmin = np.argmin(values) - idxmin = np.argmin(vals) + if idxmin == len(values): # score is monotonically decreasing + return False - if idxmin == len(vals): # score is monotonically decreasing - return False - maxval = max(vals[idxmin:]) - minval = vals[idxmin] - answer = ((maxval - minval)/abs(minval) - 1.0) > fraction_threshold - if answer: - msg = (f"Score {score_name} is too high: during training the value {maxval}" - f" passed a treshold of {(1 + fraction_threshold) * minval}" - f" (estimate is based on {idxmin} iteration)") - warnings.warn(msg) - return answer + right_maxval = max(values[idxmin:]) + minval = values[idxmin] + + if minval <= 0: + raise ValueError( + f'Score "{self.score_name}" has min_value = {minval} which is <= 0.' + f' This control scheme is using to control scores acting like Perplexity.' + f' Ensure you control the Perplexity score or write your own controller!' + ) + + answer = (right_maxval - minval) / minval > self.fraction_threshold + + if answer: + return OutOfControlAnswer( + answer=answer, + error_message=( + f"Score {self.score_name} is too high!" 
+ f" Right max value: {right_maxval}, min value: {minval}" + ), + ) + + return OutOfControlAnswer(answer=answer) class ControllerAgent: @@ -293,8 +381,10 @@

    Fields

    Each agent is described by: * reg_name: the name of regularizer having `tau` which needs to be changed - * score_to_track: score providing control of the callback execution * tau_converter: function or string describing how to get new `tau` from old `tau` + * score_to_track: score name providing control of the callback execution + * fraction_threshold: threshold to control score_to_track + * score_controller: custom score controller providing control of the callback execution * local_dict: dictionary containing values of several variables, most notably, `user_value` * is_working: @@ -304,37 +394,125 @@

    Fields

    See top-level docstring for details. """ - def __init__(self, reg_name, score_to_track, tau_converter, max_iters, local_dict=None): + + def __init__( + self, + reg_name: str, + tau_converter: Callable or str, + max_iters: int or float, + score_to_track: Union[str, List[str], None] = None, + fraction_threshold: Union[float, List[float], None] = None, + score_controller: Union[BaseScoreController, List[BaseScoreController], None] = None, + local_dict: dict = None): """ Parameters ---------- - reg_name : str - score_to_track : str, list of str or None - tau_converter : callable or str - local_dict : dict - max_iters : int or float - Agent will stop changing tau after `max_iters` iterations + reg_name + tau_converter + max_iters + Agent will stop changing tau after `max_iters` iterations, `max_iters` could be `float("NaN")` and `float("inf")` values: that way agent will continue operating even outside this `RegularizationControllerCube` + score_to_track + Name of score to track. + Please, use this definition to track only scores of type PerplexityScore. + In other cases we recommend you to write you own ScoreController + fraction_threshold + Uses to define threshold to control PerplexityScore + Default value is 0.05. + If `fraction_threshold` is a list, it should be of the same length, as `score_to_track`. + score_controller + Score controller or controllers. + One can use this parameter for scores other than Perplexity + (or other scores that behave like Perplexity). + This is a more flexible and customizable way to control scores. 
+ local_dict """ if local_dict is None: local_dict = dict() self.reg_name = reg_name self.tau_converter = tau_converter - if isinstance(score_to_track, list): - self.score_to_track = score_to_track - elif isinstance(score_to_track, str): - self.score_to_track = [score_to_track] - else: - self.score_to_track = [] + + scores_to_track = self._validate_score_to_track(score_to_track) + fraction_thresholds = self._validate_fraction_threshold( + fraction_threshold, required_length=len(scores_to_track) + ) + + assert len(scores_to_track) == len(fraction_thresholds) + + perplexity_like_score_controllers = [ + PerplexityScoreController(name, threshold) + for (name, threshold) in zip(scores_to_track, fraction_thresholds) + ] + + self.score_controllers = list() + self.score_controllers.extend(perplexity_like_score_controllers) + self.score_controllers.extend( + self._validate_score_controller(score_controller) + ) self.is_working = True self.local_dict = local_dict self.tau_history = [] self.max_iters = max_iters + @staticmethod + def _validate_score_to_track( + score_to_track: Union[str, List[str], None]) -> List[str]: + + if isinstance(score_to_track, list): + return score_to_track + if score_to_track is None: + return list() + if isinstance(score_to_track, str): + return [score_to_track] + + raise TypeError(f'Wrong type of `score_to_track`: "{type(score_to_track)}"!') + + @staticmethod + def _validate_fraction_threshold( + fraction_threshold: Union[float, List[float], None], + required_length: int, + ) -> List[float]: + + if fraction_threshold is None: + return [PerplexityScoreController.DEFAULT_FRACTION_THRESHOLD] * required_length + if isinstance(fraction_threshold, Number): + return [float(fraction_threshold)] * required_length + + if not isinstance(fraction_threshold, list): + raise TypeError( + f'Wrong type of `fraction_threshold`: "{type(fraction_threshold)}"!' 
+ ) + + if len(fraction_threshold) != required_length: + raise ValueError( + f'Wrong length of `fraction_threshold`: {len(fraction_threshold)}!' + f' Expected the length to be equal to {required_length}.' + ) + + return fraction_threshold + + @staticmethod + def _validate_score_controller( + score_controller: Union[BaseScoreController, List[BaseScoreController], None] + ) -> List[BaseScoreController]: + + if score_controller is None: + return list() + + elif isinstance(score_controller, BaseScoreController): + return [score_controller] + + elif (not isinstance(score_controller, list) or not all( + isinstance(score, BaseScoreController) for score in score_controller)): + raise TypeError(f'Wrong type of `score_controller`: "{type(score_controller)}"!') + + else: + return score_controller + def _convert_tau(self): """ """ if isinstance(self.tau_converter, str): @@ -365,7 +543,7 @@

    Fields

    Note that zero means "cube just started", not "the model is brand new" """ - current_tau = model.regularizers[self.reg_name].tau + current_tau = model.get_regularizer(self.reg_name).tau self.tau_history.append(current_tau) self.local_dict["prev_tau"] = current_tau self.local_dict["cur_iter"] = cur_iter @@ -379,14 +557,14 @@

    Fields

    if self.is_working: should_stop = any( - is_score_out_of_control(model, score) for score in self.score_to_track + score_controller(model) for score_controller in self.score_controllers ) if should_stop: warnings.warn(W_HALT_CONTROL.format(len(self.tau_history))) self.is_working = False - model.regularizers[self.reg_name].tau = self._find_safe_tau() + model.get_regularizer(self.reg_name).tau = self._find_safe_tau() else: - model.regularizers[self.reg_name].tau = self._convert_tau() + model.get_regularizer(self.reg_name).tau = self._convert_tau() class RegularizationControllerCube(BaseCube): @@ -404,26 +582,33 @@

    Fields

    regularizers params each dict should contain the following fields: ("reg_name" or "regularizer"), - "score_to_track" (optional), "tau_converter", + "score_to_track" (optional), + "fraction_threshold" (optional), + "score_controller" (optional), "user_value_grid" See top-level docstring for details. Examples: >> {"regularizer": artm.regularizers.<...>, - >> "score_to_track": "PerplexityScore@all", >> "tau_converter": "prev_tau * user_value", + >> "score_to_track": "PerplexityScore@all", + >> "fraction_threshold": 0.1, >> "user_value_grid": [0.5, 1, 2]} ----------- >> {"reg_name": "decorrelator_for_ngramms", - >> "score_to_track": None, >> "tau_converter": ( >> lambda initial_tau, prev_tau, cur_iter, user_value: >> initial_tau * (cur_iter % 2) + user_value >> ) + >> "score_to_track": None, + >> "fraction_threshold": None, + >> "score_controller": [ + >> PerplexityScoreController("PerplexityScore@all", 0.1) + >> ], >> "user_value_grid": [0, 1]} reg_search : str @@ -450,9 +635,9 @@

    Fields

    separate_thread=separate_thread) self._relative = use_relative_coefficients self.data_stats = None - self.raw_parameters = parameters if isinstance(parameters, dict): parameters = [parameters] + self.raw_parameters = parameters self._convert_parameters(parameters) def _convert_parameters(self, all_parameters): @@ -513,22 +698,29 @@

    Fields

    for (agent_blueprint_template, field_name, current_user_value) in one_model_parameter: agent_blueprint = dict(agent_blueprint_template) - if agent_blueprint["reg_name"] is None: - regularizer = agent_blueprint["regularizer"] - new_regularizer = deepcopy(regularizer) - handle_regularizer( - self._relative, - new_model, - new_regularizer, - self.data_stats, - ) - agent_blueprint["reg_name"] = new_regularizer.name - else: - if agent_blueprint['reg_name'] not in new_model.regularizers.data: + if agent_blueprint.get("reg_name") is not None: + reg_name = agent_blueprint['reg_name'] + + if reg_name not in new_model.all_regularizers: error_msg = (f"Regularizer {agent_blueprint['reg_name']} does not exist. " f"Cannot be modified.") raise ValueError(error_msg) + elif agent_blueprint.get("regularizer") is not None: + regularizer = agent_blueprint["regularizer"] + new_regularizer = deepcopy(regularizer) + if isinstance(regularizer, BaseRegularizer): + new_model.custom_regularizers[new_regularizer.name] = new_regularizer + else: # classic bigARTM regularizer, attempt to relativize it's coefficients + handle_regularizer( + self._relative, + new_model, + new_regularizer, + self.data_stats, + ) + agent_blueprint["reg_name"] = new_regularizer.name + else: + raise ValueError("Either 'reg_name' or 'regularizer' should be set") agent_blueprint['local_dict']['user_value'] = current_user_value # ControllerAgent needs only reg_name in constructor agent_blueprint.pop("regularizer") @@ -562,88 +754,114 @@

    Fields

    -

    Functions

    +
    +
    +

    Classes

    -
    -def is_score_out_of_control(model, score_name, fraction_threshold=0.05) +
    +class BaseScoreController +(score_name)
    -

    Returns True if score isn't 'sort of decreasing' anymore.

    -

    See docstring for RegularizationControllerCube for details

    -

    Parameters

    -
    -
    model : TopicModel
    -
     
    -
    score_name : str or None
    -
     
    -
    fraction_threshold : float
    -
     
    -
    -

    Returns

    -
    -
    bool
    -
     
    -
    +
    -Source code -
    def is_score_out_of_control(model, score_name, fraction_threshold=0.05):
    -    """
    -    Returns True if score isn't 'sort of decreasing' anymore.
    +
    +Expand source code
    +
    +
    class BaseScoreController:
    +    def __init__(self, score_name):
    +        self.score_name = score_name
     
    -    See docstring for RegularizationControllerCube for details
    +    def get_score_values(self, model):
    +        if self.score_name not in model.scores:  # case of None is handled here as well
    +            return None
     
    -    Parameters
    -    ----------
    -    model : TopicModel
    -    score_name : str or None
    -    fraction_threshold : float
    +        vals = model.scores[self.score_name]
     
    -    Returns
    -    -------
    -    bool
    +        if len(vals) == 0:
    +            return None
     
    -    """
    +        return vals
     
    -    if score_name not in model.scores:  # case of None is handled here as well
    -        return False
    +    def __call__(self, model):
    +        values = self.get_score_values(model)
     
    -    vals = model.scores[score_name]
    -    if len(vals) == 0:
    -        return False
    +        if values is None:
    +            return False
     
    -    idxmin = np.argmin(vals)
    +        try:
    +            out_of_control_result = self.is_out_of_control(values)
    +        except Exception as ex:
    +            raise ValueError(
    +                f"An error occurred while controlling {self.score_name}!"
    +                f" Message: {ex}. Score values: {values}"
    +            )
     
    -    if idxmin == len(vals):  # score is monotonically decreasing
    -        return False
    -    maxval = max(vals[idxmin:])
    -    minval = vals[idxmin]
    -    answer = ((maxval - minval)/abs(minval) - 1.0) > fraction_threshold
    -    if answer:
    -        msg = (f"Score {score_name} is too high: during training the value {maxval}"
    -               f" passed a treshold of {(1 + fraction_threshold) * minval}"
    -               f" (estimate is based on {idxmin} iteration)")
    -        warnings.warn(msg)
    -    return answer
    + if out_of_control_result.error_message is not None: + warnings.warn(out_of_control_result.error_message) + + return out_of_control_result.answer + + def is_out_of_control(self, values: List[float]) -> OutOfControlAnswer: + raise NotImplementedError
    +
    +

    Subclasses

    + +

    Methods

    +
    +
    +def get_score_values(self, model) +
    +
    +
    +
    + +Expand source code + +
    def get_score_values(self, model):
    +    if self.score_name not in model.scores:  # case of None is handled here as well
    +        return None
    +
    +    vals = model.scores[self.score_name]
    +
    +    if len(vals) == 0:
    +        return None
    +
    +    return vals
    +
    +
    +
    +def is_out_of_control(self, values: List[float]) -> OutOfControlAnswer +
    +
    +
    +
    + +Expand source code + +
    def is_out_of_control(self, values: List[float]) -> OutOfControlAnswer:
    +    raise NotImplementedError
    -
    -
    -

    Classes

    -
    +

    class ControllerAgent -(reg_name, score_to_track, tau_converter, max_iters, local_dict=None) +(reg_name: str, tau_converter: Callable, max_iters: int, score_to_track: Union[str, List[str], NoneType] = None, fraction_threshold: Union[float, List[float], NoneType] = None, score_controller: Union[topicnet.cooking_machine.cubes.controller_cube.BaseScoreController, List[topicnet.cooking_machine.cubes.controller_cube.BaseScoreController], NoneType] = None, local_dict: dict = None)
    -

    Allows to change tau during the _fit method.

    +

    Allows to change tau during the _fit method.

    Each TopicModel has a .callbacks attribute. -This is a list consisting of various ControllerAgents. +This is a list consisting of various ControllerAgents. Each agent is described by:

    • reg_name: the name of regularizer having tau which needs to be changed
    • -
    • score_to_track: score providing control of the callback execution
    • tau_converter: function or string describing how to get new tau from old tau
    • +
    • score_to_track: score name providing control of the callback execution
    • +
    • fraction_threshold: threshold to control score_to_track
    • +
    • score_controller: custom score controller providing control of the callback execution
    • local_dict: dictionary containing values of several variables, most notably, user_value
    • is_working: @@ -654,21 +872,34 @@

      Classes

      See top-level docstring for details.

      Parameters

      -
      reg_name : str
      -
       
      -
      score_to_track : str, list of str or None
      +
      reg_name
       
      -
      tau_converter : callable or str
      +
      tau_converter
       
      -
      local_dict : dict
      -
       
      -
      max_iters : int or float
      -
      Agent will stop changing tau after max_iters iterations +
      max_iters
      +
      Agent will stop changing tau after max_iters iterations, max_iters could be float("NaN") and float("inf") values: -that way agent will continue operating even outside this RegularizationControllerCube
      -
    +that way agent will continue operating even outside this RegularizationControllerCube
    +
    score_to_track
    +
    Name of score to track. +Please, use this definition to track only scores of type PerplexityScore. +In other cases we recommend you to write your own ScoreController
    +
    fraction_threshold
    +
    Used to define the threshold for controlling PerplexityScore. +Default value is 0.05. +If fraction_threshold is a list, it should be of the same length as score_to_track.
    +
    score_controller
    +
    Score controller or controllers. +One can use this parameter for scores other than Perplexity +(or other scores that behave like Perplexity). +This is a more flexible and customizable way to control scores.
    +
    local_dict
    +
     
    +
    -Source code + +Expand source code +
    class ControllerAgent:
         """
         Allows to change `tau` during the `_fit` method.
    @@ -678,8 +909,10 @@ 

    Parameters

    Each agent is described by: * reg_name: the name of regularizer having `tau` which needs to be changed - * score_to_track: score providing control of the callback execution * tau_converter: function or string describing how to get new `tau` from old `tau` + * score_to_track: score name providing control of the callback execution + * fraction_threshold: threshold to control score_to_track + * score_controller: custom score controller providing control of the callback execution * local_dict: dictionary containing values of several variables, most notably, `user_value` * is_working: @@ -689,37 +922,125 @@

    Parameters

    See top-level docstring for details. """ - def __init__(self, reg_name, score_to_track, tau_converter, max_iters, local_dict=None): + + def __init__( + self, + reg_name: str, + tau_converter: Callable or str, + max_iters: int or float, + score_to_track: Union[str, List[str], None] = None, + fraction_threshold: Union[float, List[float], None] = None, + score_controller: Union[BaseScoreController, List[BaseScoreController], None] = None, + local_dict: dict = None): """ Parameters ---------- - reg_name : str - score_to_track : str, list of str or None - tau_converter : callable or str - local_dict : dict - max_iters : int or float - Agent will stop changing tau after `max_iters` iterations + reg_name + tau_converter + max_iters + Agent will stop changing tau after `max_iters` iterations, `max_iters` could be `float("NaN")` and `float("inf")` values: that way agent will continue operating even outside this `RegularizationControllerCube` + score_to_track + Name of score to track. + Please, use this definition to track only scores of type PerplexityScore. + In other cases we recommend you to write you own ScoreController + fraction_threshold + Uses to define threshold to control PerplexityScore + Default value is 0.05. + If `fraction_threshold` is a list, it should be of the same length, as `score_to_track`. + score_controller + Score controller or controllers. + One can use this parameter for scores other than Perplexity + (or other scores that behave like Perplexity). + This is a more flexible and customizable way to control scores. 
+ local_dict """ if local_dict is None: local_dict = dict() self.reg_name = reg_name self.tau_converter = tau_converter - if isinstance(score_to_track, list): - self.score_to_track = score_to_track - elif isinstance(score_to_track, str): - self.score_to_track = [score_to_track] - else: - self.score_to_track = [] + + scores_to_track = self._validate_score_to_track(score_to_track) + fraction_thresholds = self._validate_fraction_threshold( + fraction_threshold, required_length=len(scores_to_track) + ) + + assert len(scores_to_track) == len(fraction_thresholds) + + perplexity_like_score_controllers = [ + PerplexityScoreController(name, threshold) + for (name, threshold) in zip(scores_to_track, fraction_thresholds) + ] + + self.score_controllers = list() + self.score_controllers.extend(perplexity_like_score_controllers) + self.score_controllers.extend( + self._validate_score_controller(score_controller) + ) self.is_working = True self.local_dict = local_dict self.tau_history = [] self.max_iters = max_iters + @staticmethod + def _validate_score_to_track( + score_to_track: Union[str, List[str], None]) -> List[str]: + + if isinstance(score_to_track, list): + return score_to_track + if score_to_track is None: + return list() + if isinstance(score_to_track, str): + return [score_to_track] + + raise TypeError(f'Wrong type of `score_to_track`: "{type(score_to_track)}"!') + + @staticmethod + def _validate_fraction_threshold( + fraction_threshold: Union[float, List[float], None], + required_length: int, + ) -> List[float]: + + if fraction_threshold is None: + return [PerplexityScoreController.DEFAULT_FRACTION_THRESHOLD] * required_length + if isinstance(fraction_threshold, Number): + return [float(fraction_threshold)] * required_length + + if not isinstance(fraction_threshold, list): + raise TypeError( + f'Wrong type of `fraction_threshold`: "{type(fraction_threshold)}"!' 
+ ) + + if len(fraction_threshold) != required_length: + raise ValueError( + f'Wrong length of `fraction_threshold`: {len(fraction_threshold)}!' + f' Expected the length to be equal to {required_length}.' + ) + + return fraction_threshold + + @staticmethod + def _validate_score_controller( + score_controller: Union[BaseScoreController, List[BaseScoreController], None] + ) -> List[BaseScoreController]: + + if score_controller is None: + return list() + + elif isinstance(score_controller, BaseScoreController): + return [score_controller] + + elif (not isinstance(score_controller, list) or not all( + isinstance(score, BaseScoreController) for score in score_controller)): + raise TypeError(f'Wrong type of `score_controller`: "{type(score_controller)}"!') + + else: + return score_controller + def _convert_tau(self): """ """ if isinstance(self.tau_converter, str): @@ -750,7 +1071,7 @@

    Parameters

    Note that zero means "cube just started", not "the model is brand new" """ - current_tau = model.regularizers[self.reg_name].tau + current_tau = model.get_regularizer(self.reg_name).tau self.tau_history.append(current_tau) self.local_dict["prev_tau"] = current_tau self.local_dict["cur_iter"] = cur_iter @@ -764,14 +1085,14 @@

    Parameters

    if self.is_working: should_stop = any( - is_score_out_of_control(model, score) for score in self.score_to_track + score_controller(model) for score_controller in self.score_controllers ) if should_stop: warnings.warn(W_HALT_CONTROL.format(len(self.tau_history))) self.is_working = False - model.regularizers[self.reg_name].tau = self._find_safe_tau() + model.get_regularizer(self.reg_name).tau = self._find_safe_tau() else: - model.regularizers[self.reg_name].tau = self._convert_tau()
    + model.get_regularizer(self.reg_name).tau = self._convert_tau()

    Methods

    @@ -779,16 +1100,18 @@

    Methods

    def invoke(self, model, cur_iter)
    -

    Attempts to change tau if is_working == True. Otherwise, keeps to the last safe value.

    +

    Attempts to change tau if is_working == True. Otherwise, keeps to the last safe value.

    Parameters

    model : TopicModel
     
    cur_iter : int
    Note that zero means "cube just started", not "the model is brand new"
    -
    +
    -Source code + +Expand source code +
    def invoke(self, model, cur_iter):
         """
         Attempts to change tau if `is_working == True`. Otherwise, keeps to the last safe value.
    @@ -800,7 +1123,7 @@ 

    Parameters

    Note that zero means "cube just started", not "the model is brand new" """ - current_tau = model.regularizers[self.reg_name].tau + current_tau = model.get_regularizer(self.reg_name).tau self.tau_history.append(current_tau) self.local_dict["prev_tau"] = current_tau self.local_dict["cur_iter"] = cur_iter @@ -814,51 +1137,185 @@

    Parameters

    if self.is_working: should_stop = any( - is_score_out_of_control(model, score) for score in self.score_to_track + score_controller(model) for score_controller in self.score_controllers ) if should_stop: warnings.warn(W_HALT_CONTROL.format(len(self.tau_history))) self.is_working = False - model.regularizers[self.reg_name].tau = self._find_safe_tau() + model.get_regularizer(self.reg_name).tau = self._find_safe_tau() else: - model.regularizers[self.reg_name].tau = self._convert_tau()
    + model.get_regularizer(self.reg_name).tau = self._convert_tau()
    +
    +
    + +
    +
    +class OutOfControlAnswer +(answer: bool, error_message: Union[str, NoneType] = None) +
    +
    +

    OutOfControlAnswer(answer:bool, error_message:Union[str, NoneType]=None)

    +
    + +Expand source code + +
    class OutOfControlAnswer:
    +    answer: bool
    +    error_message: Optional[str] = None
    +
    +

    Class variables

    +
    +
    var error_message : Union[str, NoneType]
    +
    +
    +
    +
    +
    +
    +class PerplexityScoreController +(score_name, fraction_threshold=0.05) +
    +
    +

    Controller suitable for controlling the Perplexity score. +For other scores, please verify its suitability yourself.

    +
    + +Expand source code + +
    class PerplexityScoreController(BaseScoreController):
    +    """
    +    Controller is proper to control the Perplexity score.
    +    For others, please ensure for yourself.
    +    """
    +    DEFAULT_FRACTION_THRESHOLD = 0.05
    +
    +    def __init__(self, score_name, fraction_threshold=DEFAULT_FRACTION_THRESHOLD):
    +        super().__init__(score_name)
    +        self.fraction_threshold = fraction_threshold
    +
    +    def is_out_of_control(self, values: List[float]):
    +        idxmin = np.argmin(values)
    +
    +        if idxmin == len(values):  # score is monotonically decreasing
    +            return False
    +
    +        right_maxval = max(values[idxmin:])
    +        minval = values[idxmin]
    +
    +        if minval <= 0:
    +            raise ValueError(
    +                f'Score "{self.score_name}" has min_value = {minval} which is <= 0.'
    +                f' This control scheme is using to control scores acting like Perplexity.'
    +                f' Ensure you control the Perplexity score or write your own controller!'
    +            )
    +
    +        answer = (right_maxval - minval) / minval > self.fraction_threshold
    +
    +        if answer:
    +            return OutOfControlAnswer(
    +                answer=answer,
    +                error_message=(
    +                    f"Score {self.score_name} is too high!"
    +                    f" Right max value: {right_maxval}, min value: {minval}"
    +                ),
    +            )
    +
    +        return OutOfControlAnswer(answer=answer)
    +
    +

    Ancestors

    + +

    Class variables

    +
    +
    var DEFAULT_FRACTION_THRESHOLD
    +
    +
    +
    +
    +

    Methods

    +
    +
    +def is_out_of_control(self, values: List[float]) +
    +
    +
    +
    + +Expand source code + +
    def is_out_of_control(self, values: List[float]):
    +    idxmin = np.argmin(values)
    +
    +    if idxmin == len(values):  # score is monotonically decreasing
    +        return False
    +
    +    right_maxval = max(values[idxmin:])
    +    minval = values[idxmin]
    +
    +    if minval <= 0:
    +        raise ValueError(
    +            f'Score "{self.score_name}" has min_value = {minval} which is <= 0.'
    +            f' This control scheme is using to control scores acting like Perplexity.'
    +            f' Ensure you control the Perplexity score or write your own controller!'
    +        )
    +
    +    answer = (right_maxval - minval) / minval > self.fraction_threshold
    +
    +    if answer:
    +        return OutOfControlAnswer(
    +            answer=answer,
    +            error_message=(
    +                f"Score {self.score_name} is too high!"
    +                f" Right max value: {right_maxval}, min value: {minval}"
    +            ),
    +        )
    +
    +    return OutOfControlAnswer(answer=answer)
    class RegularizationControllerCube -(num_iter, parameters, reg_search='grid', use_relative_coefficients=True, strategy=None, tracked_score_function=None, verbose=False, separate_thread=True) +(num_iter: int, parameters, reg_search='grid', use_relative_coefficients: bool = True, strategy=None, tracked_score_function=None, verbose: bool = False, separate_thread: bool = True)
    -

    Abstract class for all cubes.

    +

    Abstract class for all cubes.

    Initialize stage. Checks params and update internal attributes.

    Parameters

    num_iter : int
    number of iterations or method
    -
    parameters : list[dict] or dict
    +
    parameters : list[dict] or dict

    regularizers params each dict should contain the following fields: ("reg_name" or "regularizer"), -"score_to_track" (optional), "tau_converter", +"score_to_track" (optional), +"fraction_threshold" (optional), +"score_controller" (optional), "user_value_grid" See top-level docstring for details. Examples:

        >>  {"regularizer": artm.regularizers.<...>,
    -    >>   "score_to_track": "PerplexityScore@all",
         >>   "tau_converter": "prev_tau * user_value",
    +    >>   "score_to_track": "PerplexityScore@all",
    +    >>   "fraction_threshold": 0.1,
         >>   "user_value_grid": [0.5, 1, 2]}
     

        >>  {"reg_name": "decorrelator_for_ngramms",
    -    >>   "score_to_track": None,
         >>   "tau_converter": (
         >>       lambda initial_tau, prev_tau, cur_iter, user_value:
         >>       initial_tau * (cur_iter % 2) + user_value
         >>   )
    +    >>   "score_to_track": None,
    +    >>   "fraction_threshold": None,
    +    >>   "score_controller": [
    +    >>       PerplexityScoreController("PerplexityScore@all", 0.1)
    +    >>   ],
         >>   "user_value_grid": [0, 1]}
     
    @@ -874,13 +1331,15 @@

    Parameters

    i.e. normalized over collection properties
    strategy : BaseStrategy
    optimization approach (Default value = None)
    -
    tracked_score_function : str ot callable
    +
    tracked_score_function : str or callable
    optimizable function for strategy (Default value = None)
    verbose : bool
    visualization flag (Default value = False)
    - +
    -Source code + +Expand source code +
    class RegularizationControllerCube(BaseCube):
         def __init__(self, num_iter: int, parameters,
                      reg_search='grid', use_relative_coefficients: bool = True, strategy=None,
    @@ -896,26 +1355,33 @@ 

    Parameters

    regularizers params each dict should contain the following fields: ("reg_name" or "regularizer"), - "score_to_track" (optional), "tau_converter", + "score_to_track" (optional), + "fraction_threshold" (optional), + "score_controller" (optional), "user_value_grid" See top-level docstring for details. Examples: >> {"regularizer": artm.regularizers.<...>, - >> "score_to_track": "PerplexityScore@all", >> "tau_converter": "prev_tau * user_value", + >> "score_to_track": "PerplexityScore@all", + >> "fraction_threshold": 0.1, >> "user_value_grid": [0.5, 1, 2]} ----------- >> {"reg_name": "decorrelator_for_ngramms", - >> "score_to_track": None, >> "tau_converter": ( >> lambda initial_tau, prev_tau, cur_iter, user_value: >> initial_tau * (cur_iter % 2) + user_value >> ) + >> "score_to_track": None, + >> "fraction_threshold": None, + >> "score_controller": [ + >> PerplexityScoreController("PerplexityScore@all", 0.1) + >> ], >> "user_value_grid": [0, 1]} reg_search : str @@ -942,9 +1408,9 @@

    Parameters

    separate_thread=separate_thread) self._relative = use_relative_coefficients self.data_stats = None - self.raw_parameters = parameters if isinstance(parameters, dict): parameters = [parameters] + self.raw_parameters = parameters self._convert_parameters(parameters) def _convert_parameters(self, all_parameters): @@ -1005,22 +1471,29 @@

    Parameters

    for (agent_blueprint_template, field_name, current_user_value) in one_model_parameter: agent_blueprint = dict(agent_blueprint_template) - if agent_blueprint["reg_name"] is None: - regularizer = agent_blueprint["regularizer"] - new_regularizer = deepcopy(regularizer) - handle_regularizer( - self._relative, - new_model, - new_regularizer, - self.data_stats, - ) - agent_blueprint["reg_name"] = new_regularizer.name - else: - if agent_blueprint['reg_name'] not in new_model.regularizers.data: + if agent_blueprint.get("reg_name") is not None: + reg_name = agent_blueprint['reg_name'] + + if reg_name not in new_model.all_regularizers: error_msg = (f"Regularizer {agent_blueprint['reg_name']} does not exist. " f"Cannot be modified.") raise ValueError(error_msg) + elif agent_blueprint.get("regularizer") is not None: + regularizer = agent_blueprint["regularizer"] + new_regularizer = deepcopy(regularizer) + if isinstance(regularizer, BaseRegularizer): + new_model.custom_regularizers[new_regularizer.name] = new_regularizer + else: # classic bigARTM regularizer, attempt to relativize it's coefficients + handle_regularizer( + self._relative, + new_model, + new_regularizer, + self.data_stats, + ) + agent_blueprint["reg_name"] = new_regularizer.name + else: + raise ValueError("Either 'reg_name' or 'regularizer' should be set") agent_blueprint['local_dict']['user_value'] = current_user_value # ControllerAgent needs only reg_name in constructor agent_blueprint.pop("regularizer") @@ -1058,7 +1531,7 @@

    Methods

    def apply(self, topic_model, one_model_parameter, dictionary=None, model_id=None)
    -

    Applies regularizers and controller agents to model

    +

    Applies regularizers and controller agents to model

    Parameters

    topic_model : TopicModel
    @@ -1074,9 +1547,11 @@

    Returns

    TopicModel
     
    -
    +
    -Source code + +Expand source code +
    def apply(self, topic_model, one_model_parameter, dictionary=None, model_id=None):
         """
         Applies regularizers and controller agents to model
    @@ -1106,22 +1581,29 @@ 

    Returns

    for (agent_blueprint_template, field_name, current_user_value) in one_model_parameter: agent_blueprint = dict(agent_blueprint_template) - if agent_blueprint["reg_name"] is None: - regularizer = agent_blueprint["regularizer"] - new_regularizer = deepcopy(regularizer) - handle_regularizer( - self._relative, - new_model, - new_regularizer, - self.data_stats, - ) - agent_blueprint["reg_name"] = new_regularizer.name - else: - if agent_blueprint['reg_name'] not in new_model.regularizers.data: + if agent_blueprint.get("reg_name") is not None: + reg_name = agent_blueprint['reg_name'] + + if reg_name not in new_model.all_regularizers: error_msg = (f"Regularizer {agent_blueprint['reg_name']} does not exist. " f"Cannot be modified.") raise ValueError(error_msg) + elif agent_blueprint.get("regularizer") is not None: + regularizer = agent_blueprint["regularizer"] + new_regularizer = deepcopy(regularizer) + if isinstance(regularizer, BaseRegularizer): + new_model.custom_regularizers[new_regularizer.name] = new_regularizer + else: # classic bigARTM regularizer, attempt to relativize it's coefficients + handle_regularizer( + self._relative, + new_model, + new_regularizer, + self.data_stats, + ) + agent_blueprint["reg_name"] = new_regularizer.name + else: + raise ValueError("Either 'reg_name' or 'regularizer' should be set") agent_blueprint['local_dict']['user_value'] = current_user_value # ControllerAgent needs only reg_name in constructor agent_blueprint.pop("regularizer") @@ -1156,13 +1638,15 @@

    Index

  • topicnet.cooking_machine.cubes
  • -
  • Functions

    +
  • Classes

    +
      +
    • +

      BaseScoreController

    • -
    • Classes

      -
      • ControllerAgent

          @@ -1170,6 +1654,19 @@

          OutOfControlAnswer

          + + +
        • +

          PerplexityScoreController

          + +
        • +
        • RegularizationControllerCube

          • apply
          • @@ -1181,7 +1678,7 @@

            -

            Generated by pdoc 0.6.3.

            +

            Generated by pdoc 0.8.1.

            diff --git a/docs/cooking_machine/cubes/cube_creator.html b/docs/cooking_machine/cubes/cube_creator.html index 34d1188..609589c 100644 --- a/docs/cooking_machine/cubes/cube_creator.html +++ b/docs/cooking_machine/cubes/cube_creator.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.cubes.cube_creator API documentation - - + + @@ -21,7 +21,9 @@

            Module topicnet.cooking_machine.cubes.cube_creator
            -Source code + +Expand source code +
            from .base_cube import BaseCube
             from inspect import signature
             from copy import deepcopy
            @@ -265,17 +267,17 @@ 

            Classes

            class CubeCreator -(num_iter, parameters, reg_search='grid', strategy=None, model_class='TopicModel', second_level=False, tracked_score_function=None, verbose=False, separate_thread=True) +(num_iter: int, parameters, reg_search='grid', strategy=None, model_class='TopicModel', second_level=False, tracked_score_function=None, verbose=False, separate_thread=True)
            -

            Class for creating models with different initial parameters.

            +

            Class for creating models with different initial parameters.

            Parameters

            model : TopicModel
            TopicModel instance
            num_iter : int
            number of iterations or method
            -
            parameters : list[dict] or dict
            +
            parameters : list[dict] or dict
            parameters for model initialization
            reg_search : str
            "grid" or "pair"
            @@ -289,9 +291,11 @@

            Parameters

            visualization flag (Default value = False)
            separate_thread : bool
            will train models inside a separate thread if True
            -
            +
            -Source code + +Expand source code +
            class CubeCreator(BaseCube):
                 """
                 Class for creating models with different initial parameters.
            @@ -525,7 +529,7 @@ 

            Class variables

            var DEFAULT_SEED_VALUE
            -
            +

            Methods

            @@ -534,7 +538,7 @@

            Methods

            def apply(self, topic_model, one_cube_parameter, dictionary=None, model_id=None)
            -

            Parameters

            +

            Parameters

            topic_model : TopicModel
             
            @@ -545,9 +549,11 @@

            Methods

            model_id : str
            (Default value = None)
            -

            Returns

            +

            Returns

            -Source code + +Expand source code +
            def apply(self, topic_model, one_cube_parameter, dictionary=None, model_id=None):
                 """
             
            @@ -642,7 +648,7 @@ 

            diff --git a/docs/cooking_machine/cubes/greedy_strategy.html b/docs/cooking_machine/cubes/greedy_strategy.html index ba6a784..903ef72 100644 --- a/docs/cooking_machine/cubes/greedy_strategy.html +++ b/docs/cooking_machine/cubes/greedy_strategy.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.cubes.greedy_strategy API documentation - - + + @@ -21,7 +21,9 @@

            Module topicnet.cooking_machine.cubes.greedy_strategy
            -Source code + +Expand source code +
            import numpy as np
             from .strategy import BaseStrategy
             
            @@ -203,19 +205,21 @@ 

            Classes

            class GreedyStrategy -(renormalize=False) +(renormalize: bool = False)
            -

            Allows to visit nodes of parameters' grid in a particular order.

            +

            Allows to visit nodes of parameters' grid in a particular order.

            The rough idea:
            We are given grid of (values1 x values2 x values3).
            This strategy will find best value among points of form [v1, 0, 0] and will mark first coordinate as finished.
            Then we search for best v2 among [v1, v2, 0].
            Then [v1, v2, v3] etc.

            -

            Initialize stage. Updates internal attributes.

            +

            Initialize stage. Updates internal attributes.

            -Source code + +Expand source code +
            class GreedyStrategy(BaseStrategy):
                 """
                 Allows to visit nodes of parameters' grid in a particular order.
            @@ -391,7 +395,7 @@ 

            Methods

            def grid_visit_generator(self, other_parameters, reg_search)
            -

            Converts the search point given to the internal format +

            Converts the search point given to the internal format Notably, pads with zero and normalizees with some rudimentary sanity checking.

            Parameters

            @@ -405,9 +409,11 @@

            Yields

            list of lists
             
            -
            +
            -Source code + +Expand source code +
            def grid_visit_generator(self, other_parameters, reg_search):
                 """
                 Converts the search point given to the internal format
            @@ -447,16 +453,18 @@ 

            Yields

            def prepare_grid(self, other_parameters, reg_search)
            -

            Sets parameters of grid and prepares grid length for verbosity.

            +

            Sets parameters of grid and prepares grid length for verbosity.

            Parameters

            other_parameters : dict or list of dict
             
            reg_search : str
             
            -
            +
            -Source code + +Expand source code +
            def prepare_grid(self, other_parameters, reg_search):
                 """
                 Sets parameters of grid and prepares grid length for verbosity.
            @@ -510,7 +518,7 @@ 

            -

            Generated by pdoc 0.6.3.

            +

            Generated by pdoc 0.8.1.

            diff --git a/docs/cooking_machine/cubes/index.html b/docs/cooking_machine/cubes/index.html index be868bf..8adfcd7 100644 --- a/docs/cooking_machine/cubes/index.html +++ b/docs/cooking_machine/cubes/index.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.cubes API documentation - - + + @@ -19,44 +19,11 @@

            Module topicnet.cooking_machine.cubes

            -

            Cubes and their Strategies

            -

            Cube types:

            -
              -
            • BaseCube — a parent class for all the Cubes
            • -
            • RegularizersModifierCube — cube that adds or alter model regularizers
            • -
            • CubeCreator — cube that allows to change model fundamental hyperparameters (topic number)
            • -
            • RegularizationControllerCube - cube that ties together a complicated usage of RegularizersModifierCube. This cube allows for change of regularization coefficients across the model training. This allows to obtain soemwhat unique results by combining contradictionary restrictions on the model.
            • -
            -
            -

            Strategy types:

            -
              -
            • BaseStrategy — a parent class for all the Strategies
            • -
            • PerplexityStrategy — performs search in given hyperparameter space until certain score exceeds a boundary
            • -
            • GreedyStrategy — strategy that performes search in hyperparameter space consequently changing dimensions to perform a 1D search for a minimum
            • -
            -
            -

            Cube internal structure

            -

            The main cube attributes:

            -
              -
            • parameters — paramteres is an iterable object containing all the specific information about current cube. The class architecture implies that parameters should contain an iterable field describing the hyperparameters search space
            • -
            -

            Cube methods worth noticing:

            -
              -
            • __call__ — performes the cube actions to the model using provided dataset. Always recieves instance of TopicModel class and instance of Dataset class. This method does the internal workings of training models with new hyperparameters. It is responsible for logging the events (which parameters where changed) happening during the model training.

            • -
            • apply — method of the cube that prepares model for further training. This method should be specified by the user as it contains an “essence” of what is happening at this stage of the training. It could be new type of model reinitialization, change of the regualarization coefficient, adding a new level of hierarchy etc. This function defines what the cube does in the training pipeline.

            • -
            • get_jsonable_from_parameters — is a cube-specific function that transforms it parameters to dict-like form which later is written in JSON format log of the experiment.

            • -
            -
            -

            What do you need to create a new cube?

            -

            Following this 3 easy steps you will be able to write down your own cube:

            -
              -
            1. Inherit your Cube from BaseCube.

            2. -
            3. Child class should define following methods __init__, apply, get_jsonable_from_parameters. It is strongly descouraged to change __call__ method.

            4. -
            5. get_jsonable_from_parameters()[i] corresponds to the same cube step as parameters[i].

            6. -
            -Source code + +Expand source code +
            from .base_cube import BaseCube, retrieve_score_for_strategy
             from .regularizer_cube import RegularizersModifierCube
             from .controller_cube import RegularizationControllerCube
            @@ -71,32 +38,32 @@ 

            Sub-modules

            topicnet.cooking_machine.cubes.base_cube
            -
            +
            topicnet.cooking_machine.cubes.controller_cube
            -

            Allows to add ControllerAgent (with unknown parameters) to the model, which enables user to -change tau during the _fit method …

            +

            Allows to add ControllerAgent (with unknown parameters) to the model, which enables user to +change tau during the _fit method …

            topicnet.cooking_machine.cubes.cube_creator
            -
            +
            topicnet.cooking_machine.cubes.greedy_strategy
            -
            +
            topicnet.cooking_machine.cubes.perplexity_strategy
            -
            +
            topicnet.cooking_machine.cubes.regularizer_cube
            -
            +
            topicnet.cooking_machine.cubes.strategy
            -
            +
            @@ -108,7 +75,7 @@

            Sub-modules

            diff --git a/docs/cooking_machine/cubes/perplexity_strategy.html b/docs/cooking_machine/cubes/perplexity_strategy.html index 3e35ffd..5e10a12 100644 --- a/docs/cooking_machine/cubes/perplexity_strategy.html +++ b/docs/cooking_machine/cubes/perplexity_strategy.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.cubes.perplexity_strategy API documentation - - + + @@ -21,7 +21,9 @@

            Module topicnet.cooking_machine.cubes.perplexity_strateg
            -Source code + +Expand source code +
            import numpy as np
             from itertools import product
             import warnings
            @@ -201,7 +203,7 @@ 

            Module topicnet.cooking_machine.cubes.perplexity_strateg self.grid = product(*all_coeffs_grid) self.grid_len = len(all_coeffs_grid[0]) if self.grid is None: - raise ValueError(f'Failed to initialize self.grid, check initial parameters.') + raise ValueError('Failed to initialize self.grid, check initial parameters.') def grid_visit_generator(self, other_parameters, reg_search): """ @@ -263,10 +265,10 @@

            Classes

            class PerplexityStrategy -(start_point=None, step=None, max_len=25, threshold=1.05) +(start_point: float = None, step: float = None, max_len: float = 25, threshold: float = 1.05)
            -

            Search for the best perplexity score.

            +

            Search for the best perplexity score.

            Initialize stage.

            Parameters

            @@ -279,9 +281,11 @@

            Parameters

            length of progression
            threshold : float
            threshold for "perplexity out of control"
            -
            +
            -Source code + +Expand source code +
            class PerplexityStrategy(BaseStrategy):
                 """
                 Search for the best perplexity score.
            @@ -454,7 +458,7 @@ 

            Parameters

            self.grid = product(*all_coeffs_grid) self.grid_len = len(all_coeffs_grid[0]) if self.grid is None: - raise ValueError(f'Failed to initialize self.grid, check initial parameters.') + raise ValueError('Failed to initialize self.grid, check initial parameters.') def grid_visit_generator(self, other_parameters, reg_search): """ @@ -514,7 +518,7 @@

            Methods

            def grid_visit_generator(self, other_parameters, reg_search)
            -

            Yields points from search space with sanity checking of current result.

            +

            Yields points from search space with sanity checking of current result.

            Parameters

            other_parameters : dict
            @@ -524,11 +528,13 @@

            Parameters

            Yields

            -
            sequence of points in search space
            +
            sequence of points in search space
             
            -
            +
            -Source code + +Expand source code +
            def grid_visit_generator(self, other_parameters, reg_search):
                 """
                 Yields points from search space with sanity checking of current result.
            @@ -570,7 +576,7 @@ 

            Yields

            def prepare_grid(self, other_parameters, reg_search='add')
            -

            Creates search space and length for tqdm. +

            Creates search space and length for tqdm. Note, that first point in sequence is always 0.

            Parameters

            @@ -579,9 +585,11 @@

            Parameters

            reg_search : str
            "grid", "add" or "mul" defines grid search or arithmetic or geometric progression
            -
            +
            -Source code + +Expand source code +
            def prepare_grid(self, other_parameters, reg_search="add"):
                 """
                 Creates search space and length for tqdm.
            @@ -619,7 +627,7 @@ 

            Parameters

            self.grid = product(*all_coeffs_grid) self.grid_len = len(all_coeffs_grid[0]) if self.grid is None: - raise ValueError(f'Failed to initialize self.grid, check initial parameters.')
            + raise ValueError('Failed to initialize self.grid, check initial parameters.')
            @@ -661,7 +669,7 @@

            -

            Generated by pdoc 0.6.3.

            +

            Generated by pdoc 0.8.1.

            diff --git a/docs/cooking_machine/cubes/regularizer_cube.html b/docs/cooking_machine/cubes/regularizer_cube.html index ab481e6..30a94fa 100644 --- a/docs/cooking_machine/cubes/regularizer_cube.html +++ b/docs/cooking_machine/cubes/regularizer_cube.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.cubes.regularizer_cube API documentation - - + + @@ -21,10 +21,13 @@

            Module topicnet.cooking_machine.cubes.regularizer_cube
            -Source code + +Expand source code +
            from .base_cube import BaseCube
             from ..routine import transform_complex_entity_to_dict
             from ..rel_toolbox_lite import count_vocab_size, handle_regularizer
            +from ..models.base_regularizer import BaseRegularizer
             from copy import deepcopy
             
             
            @@ -168,18 +171,28 @@ 

            Module topicnet.cooking_machine.cubes.regularizer_cubeClasses

            class RegularizersModifierCube -(num_iter, regularizer_parameters, reg_search='grid', use_relative_coefficients=True, strategy=None, tracked_score_function=None, verbose=False, separate_thread=True) +(num_iter: int, regularizer_parameters, reg_search='grid', use_relative_coefficients: bool = True, strategy=None, tracked_score_function=None, verbose: bool = False, separate_thread: bool = True)
            -

            Allows to create cubes of training and apply them to a topic model.

            +

            Allows to create cubes of training and apply them to a topic model.

            Initialize stage. Checks params and update internal attributes.

            Parameters

            num_iter : int
            number of iterations or method
            -
            regularizer_parameters : list[dict] or dict
            +
            regularizer_parameters : list[dict] or dict
            regularizers params
            reg_search : str
            "grid", "pair", "add" or "mul". @@ -251,9 +264,11 @@

            Parameters

            visualization flag (Default value = False)
            separate_thread : bool
            will train models inside a separate thread if True
            -
            +
            -Source code + +Expand source code +
            class RegularizersModifierCube(BaseCube):
                 """
                 Allows to create cubes of training and apply them to a topic model.
            @@ -394,18 +409,28 @@ 

            Parameters

            regularizer_type = str(type(regularizer)) if isinstance(regularizer, dict): if regularizer['name'] in new_model.all_regularizers.keys(): + # TODO: do we actually need to deepcopy custom regularizers? new_regularizer = deepcopy(new_model.all_regularizers[regularizer['name']]) - new_regularizer._tau = params - handle_regularizer( - self._relative, - new_model, - new_regularizer, - self.data_stats, - ) + if regularizer['name'] in new_model.custom_regularizers: + new_model.custom_regularizers[regularizer['name']].tau = params + else: + # if this is classic regularizer, we attempt to relativize it's coefficients + new_regularizer._tau = params + handle_regularizer( + self._relative, + new_model, + new_regularizer, + self.data_stats, + ) else: error_msg = (f"Regularizer {regularizer['name']} does not exist. " f"Cannot be modified.") raise ValueError(error_msg) + elif isinstance(regularizer, BaseRegularizer): + # TODO: do we actually need to deepcopy here? + new_regularizer = deepcopy(regularizer) + new_regularizer.tau = params + new_model.custom_regularizers[regularizer.name] = new_regularizer elif 'Regularizer' in regularizer_type: new_regularizer = deepcopy(regularizer) new_regularizer._tau = params @@ -446,7 +471,7 @@

            Methods

            def apply(self, topic_model, one_model_parameter, dictionary=None, model_id=None)
            -

            Applies regularizers and parameters to model

            +

            Applies regularizers and parameters to model

            Parameters

            topic_model : TopicModel
            @@ -462,9 +487,11 @@

            Returns

            TopicModel
             
            -
            +
            -Source code + +Expand source code +
            def apply(self, topic_model, one_model_parameter, dictionary=None, model_id=None):
                 """
                 Applies regularizers and parameters to model
            @@ -498,18 +525,28 @@ 

            Returns

            regularizer_type = str(type(regularizer)) if isinstance(regularizer, dict): if regularizer['name'] in new_model.all_regularizers.keys(): + # TODO: do we actually need to deepcopy custom regularizers? new_regularizer = deepcopy(new_model.all_regularizers[regularizer['name']]) - new_regularizer._tau = params - handle_regularizer( - self._relative, - new_model, - new_regularizer, - self.data_stats, - ) + if regularizer['name'] in new_model.custom_regularizers: + new_model.custom_regularizers[regularizer['name']].tau = params + else: + # if this is classic regularizer, we attempt to relativize it's coefficients + new_regularizer._tau = params + handle_regularizer( + self._relative, + new_model, + new_regularizer, + self.data_stats, + ) else: error_msg = (f"Regularizer {regularizer['name']} does not exist. " f"Cannot be modified.") raise ValueError(error_msg) + elif isinstance(regularizer, BaseRegularizer): + # TODO: do we actually need to deepcopy here? + new_regularizer = deepcopy(regularizer) + new_regularizer.tau = params + new_model.custom_regularizers[regularizer.name] = new_regularizer elif 'Regularizer' in regularizer_type: new_regularizer = deepcopy(regularizer) new_regularizer._tau = params @@ -563,7 +600,7 @@

            -

            Generated by pdoc 0.6.3.

            +

            Generated by pdoc 0.8.1.

            diff --git a/docs/cooking_machine/cubes/strategy.html b/docs/cooking_machine/cubes/strategy.html index f64f6ca..884f293 100644 --- a/docs/cooking_machine/cubes/strategy.html +++ b/docs/cooking_machine/cubes/strategy.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.cubes.strategy API documentation - - + + @@ -21,7 +21,9 @@

            Module topicnet.cooking_machine.cubes.strategy
            -Source code + +Expand source code +
            from itertools import product
             from functools import reduce
             from operator import mul
            @@ -145,10 +147,12 @@ 

            Classes

            class BaseStrategy
            -

            Allows to visit nodes of parameters' grid in a particular order.

            -

            Initialize stage. Checks params and update internal attributes.

            +

            Allows to visit nodes of parameters' grid in a particular order.

            +

            Initialize stage. Checks params and update internal attributes.

            -Source code + +Expand source code +
            class BaseStrategy():
                 """
                 Allows to visit nodes of parameters' grid in a particular order.
            @@ -255,8 +259,8 @@ 

            Classes

            Subclasses

            Methods

            @@ -264,7 +268,7 @@

            Methods

            def grid_visit_generator(self, other_parameters, reg_search)
            -

            Parameters

            +

            Parameters

            other_parameters : dict or list of dict
             
            @@ -275,9 +279,11 @@

            Yields

            list or tuple
            one parameters set for model
            -
            +
            -Source code + +Expand source code +
            def grid_visit_generator(self, other_parameters, reg_search):
                 """
             
            @@ -300,16 +306,18 @@ 

            Yields

            def prepare_grid(self, other_parameters, reg_search)
            -

            Creates grid for the search. Inplace.

            +

            Creates grid for the search. Inplace.

            Parameters

            other_parameters : dict or list of dict
             
            reg_search : str
            "grid" or "pair" (and "add" or "mul" for perplexity)
            -
            +
            -Source code + +Expand source code +
            def prepare_grid(self, other_parameters, reg_search):
                 """
                 Creates grid for the search. Inplace.
            @@ -339,13 +347,15 @@ 

            Parameters

            def update_scores(self, new_value)
            -

            Parameters

            +

            Parameters

            new_value : float
             
            -
            +
            -Source code + +Expand source code +
            def update_scores(self, new_value):
                 """
             
            @@ -389,7 +399,7 @@ 

            -

            Generated by pdoc 0.6.3.

            +

            Generated by pdoc 0.8.1.

            diff --git a/docs/cooking_machine/dataset.html b/docs/cooking_machine/dataset.html index 9d5c255..23b26cf 100644 --- a/docs/cooking_machine/dataset.html +++ b/docs/cooking_machine/dataset.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.dataset API documentation - - + + @@ -21,13 +21,14 @@

            Module topicnet.cooking_machine.dataset

            -Source code + +Expand source code +
            import csv
             import os
             import pandas as pd
             import shutil
             import sys
            -import tempfile
             import warnings
             
             from glob import glob
            @@ -427,6 +428,7 @@ 

            Module topicnet.cooking_machine.dataset

            """ data_path = os.path.join(save_dataset_path, dataframe_name + '.csv') dataframe.to_csv(data_path) + return cls(data_path=data_path, **kwargs) def get_dataset(self): @@ -582,17 +584,16 @@

            Module topicnet.cooking_machine.dataset

            return False, path_to_collection if self._data_hash is None: - temp_file_descriptor, temp_file_path = tempfile.mkstemp( - prefix='temp_vw__', - suffix='.txt', - dir=self._internals_folder_path + temp_file_path = os.path.join( + self._internals_folder_path, 'temp_vw.txt' ) - self.write_vw(temp_file_path) - self._data_hash = blake2bchecksum(temp_file_path) - - os.close(temp_file_descriptor) - os.remove(temp_file_path) + try: + self.write_vw(temp_file_path) + self._data_hash = blake2bchecksum(temp_file_path) + finally: + if os.path.isfile(temp_file_path): + os.remove(temp_file_path) if os.path.isfile(path_to_collection): same_collection = blake2bchecksum(path_to_collection) == self._data_hash @@ -744,7 +745,7 @@

            Functions

            def get_modality_names(vw_string)
            -

            Gets modality names from vw_string.

            +

            Gets modality names from vw_string.

            Parameters

            vw_string : str
            @@ -756,9 +757,11 @@

            Returns

            document id
            list of str
            modalities in document
            -
            +
            -Source code + +Expand source code +
            def get_modality_names(vw_string):
                 """
                 Gets modality names from vw_string.
            @@ -788,7 +791,7 @@ 

            Returns

            def get_modality_vw(vw_string, modality_name)
            -

            Gets modality string from document vw string.

            +

            Gets modality string from document vw string.

            Parameters

            vw_string : str
            @@ -800,9 +803,11 @@

            Returns

            str
            content of modality_name modality
            -
            +
            -Source code + +Expand source code +
            def get_modality_vw(vw_string, modality_name):
                 """
                 Gets modality string from document vw string.
            @@ -836,12 +841,13 @@ 

            Classes

            class BaseDataset -(*args, **kwargs)
            -
            +
            -Source code + +Expand source code +
            class BaseDataset:
                 """ """
                 def get_source_document(self, document_id):
            @@ -868,13 +874,15 @@ 

            Methods

            def get_source_document(self, document_id)
            -

            Parameters

            +

            Parameters

            document_id : str
             
            -
            +
            -Source code + +Expand source code +
            def get_source_document(self, document_id):
                 """
             
            @@ -890,10 +898,10 @@ 

            Methods

            class Dataset -(data_path, keep_in_memory=True, batch_vectorizer_path=None, internals_folder_path=None, batch_size=1000) +(data_path: str, keep_in_memory: bool = True, batch_vectorizer_path: str = None, internals_folder_path: str = None, batch_size: int = 1000)
            -

            Class for keeping training data and documents for creating models.

            +

            Class for keeping training data and documents for creating models.

            Parameters

            data_path : str
            @@ -965,9 +973,11 @@

            Notes

          It is also worth emphasizing that, if the text collection is big, Theta matrix may not fit in memory. -So, in this case, some BigARTM scores (which depend on Theta) will stop working.

          +So, in this case, some BigARTM scores (which depend on Theta) will stop working.

          -Source code + +Expand source code +
          class Dataset(BaseDataset):
               """
               Class for keeping training data and documents for creation models.
          @@ -1256,6 +1266,7 @@ 

          Notes

          """ data_path = os.path.join(save_dataset_path, dataframe_name + '.csv') dataframe.to_csv(data_path) + return cls(data_path=data_path, **kwargs) def get_dataset(self): @@ -1411,17 +1422,16 @@

          Notes

          return False, path_to_collection if self._data_hash is None: - temp_file_descriptor, temp_file_path = tempfile.mkstemp( - prefix='temp_vw__', - suffix='.txt', - dir=self._internals_folder_path + temp_file_path = os.path.join( + self._internals_folder_path, 'temp_vw.txt' ) - self.write_vw(temp_file_path) - self._data_hash = blake2bchecksum(temp_file_path) - - os.close(temp_file_descriptor) - os.remove(temp_file_path) + try: + self.write_vw(temp_file_path) + self._data_hash = blake2bchecksum(temp_file_path) + finally: + if os.path.isfile(temp_file_path): + os.remove(temp_file_path) if os.path.isfile(path_to_collection): same_collection = blake2bchecksum(path_to_collection) == self._data_hash @@ -1572,10 +1582,10 @@

          Subclasses

          Static methods

          -def from_dataframe(dataframe, save_dataset_path, dataframe_name='dataset', **kwargs) +def from_dataframe(dataframe: pandas.core.frame.DataFrame, save_dataset_path: str, dataframe_name: str = 'dataset', **kwargs) -> Dataset
          -

          Creates dataset from pd.DataFrame +

          Creates dataset from pd.DataFrame; requires specifying a technical folder for dataset files

          Parameters

          @@ -1588,12 +1598,12 @@

          Parameters

          name for the dataset file to be saved in csv format Another Parameters


          -
          -
          **kwargs
          -
          kwargs are optional init topicnet.Dataset parameters
          -
          +

          kwargs +kwargs are optional init topicnet.Dataset parameters

          -Source code + +Expand source code +
          @classmethod
           def from_dataframe(
               cls,
          @@ -1621,17 +1631,20 @@ 

          Parameters

          """ data_path = os.path.join(save_dataset_path, dataframe_name + '.csv') dataframe.to_csv(data_path) + return cls(data_path=data_path, **kwargs)

          Instance variables

          -
          var documents
          +
          var documents : List[str]
          -
          +
          -Source code + +Expand source code +
          @property
           def documents(self) -> List[str]:
               return list(self._data_index)
          @@ -1644,9 +1657,11 @@

          Methods

          def clear_batches_folder(self)
          -

          Clear batches folder

          +

          Clear batches folder

          -Source code + +Expand source code +
          def clear_batches_folder(self):
               """
               Clear batches folder
          @@ -1662,9 +1677,11 @@ 

          Methods

          def clear_folder(self)
          -

          Clear internals_folder_path

          +

          Clear internals_folder_path

          -Source code + +Expand source code +
          def clear_folder(self):
               """
               Clear internals_folder_path
          @@ -1678,17 +1695,19 @@ 

          Methods

          -def get_batch_vectorizer(self) +def get_batch_vectorizer(self) -> artm.batches_utils.BatchVectorizer
          -

          Gets batch vectorizer.

          +

          Gets batch vectorizer.

          Returns

          artm.BatchVectorizer
           
          -
          +
          -Source code + +Expand source code +
          def get_batch_vectorizer(self) -> artm.BatchVectorizer:
               """
               Gets batch vectorizer.
          @@ -1736,26 +1755,30 @@ 

          Returns

          def get_dataset(self)
          -
          +
          -Source code + +Expand source code +
          def get_dataset(self):
               """ """
               return self._data
          -def get_dictionary(self) +def get_dictionary(self) -> artm.dictionary.Dictionary
          -

          Gets dataset's dictionary.

          +

          Gets dataset's dictionary.

          Returns

          artm.Dictionary
           
          -
          +
          -Source code + +Expand source code +
          def get_dictionary(self) -> artm.Dictionary:
               """
               Gets dataset's dictionary.
          @@ -1797,14 +1820,16 @@ 

          Returns

          def get_possible_modalities(self)
          -

          Returns extracted modalities.

          +

          Returns extracted modalities.

          Returns

          set
          all modalities in Dataset
          -
          +
          -Source code + +Expand source code +
          def get_possible_modalities(self):
               """
               Returns extracted modalities.
          @@ -1819,10 +1844,10 @@ 

          Returns

          -def get_source_document(self, document_id) +def get_source_document(self, document_id: str) -> pandas.core.frame.DataFrame
          -

          Get 'raw_text' for the document with document_id.

          +

          Get 'raw_text' for the document with document_id.

          Parameters

          document_id
          @@ -1832,9 +1857,11 @@

          Returns

          pd.DataFrame
          document_id and content of 'raw_text' column
          -
          +
          -Source code + +Expand source code +
          def get_source_document(self, document_id: str or List[str]) -> pd.DataFrame:
               """
               Get 'raw_text' for the document with `document_id`.
          @@ -1882,10 +1909,10 @@ 

          Returns

          -def get_vw_document(self, document_id) +def get_vw_document(self, document_id: str) -> pandas.core.frame.DataFrame
          -

          Get 'vw_text' for the document with document_id.

          +

          Get 'vw_text' for the document with document_id.

          Parameters

          document_id
          @@ -1895,9 +1922,11 @@

          Returns

          pd.DataFrame
          document_id and content of 'vw_text' column
          -
          +
          -Source code + +Expand source code +
          def get_vw_document(self, document_id: str or List[str]) -> pd.DataFrame:
               """
               Get 'vw_text' for the document with `document_id`.
          @@ -1945,12 +1974,14 @@ 

          Returns

          -def write_vw(self, file_path) +def write_vw(self, file_path: str) -> NoneType
          -

          Saves dataset as text file in Vowpal Wabbit format

          +

          Saves dataset as text file in Vowpal Wabbit format

          -Source code + +Expand source code +
          def write_vw(self, file_path: str) -> None:
               """
               Saves dataset as text file in Vowpal Wabbit format
          @@ -2033,7 +2064,7 @@ 

          -

          Generated by pdoc 0.6.3.

          +

          Generated by pdoc 0.8.1.

          diff --git a/docs/cooking_machine/dataset_cooc.html b/docs/cooking_machine/dataset_cooc.html index df452e2..0003134 100644 --- a/docs/cooking_machine/dataset_cooc.html +++ b/docs/cooking_machine/dataset_cooc.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.dataset_cooc API documentation - - + + @@ -21,7 +21,9 @@

          Module topicnet.cooking_machine.dataset_cooc

          -Source code + +Expand source code +
          from .dataset import Dataset
           import artm
           
          @@ -316,10 +318,10 @@ 

          Classes

          class DatasetCooc -(data_path, internals_folder_path=None, cooc_window=10, min_tf=5, min_df=5, threshold=2, **kwargs) +(data_path: str, internals_folder_path: str = None, cooc_window: int = 10, min_tf: int = 5, min_df: int = 5, threshold: int = 2, **kwargs)
          -

          Class that prepares a dataset in vw format for the WNTM model

          +

          Class that prepares a dataset in vw format for the WNTM model

          Parameters

          data_path : str
          @@ -360,9 +362,11 @@

          Parameters

          The frequency threshold above which the received pairs are selected to form the dataset
          -
          +
          -Source code + +Expand source code +
          class DatasetCooc(Dataset):
               """
               Class prepare dataset in vw format for WNTM model
          @@ -642,9 +646,11 @@ 

          Methods

          def clear_all_cooc_files(self)
          -

          Clear cooc_dir folder

          +

          Clear cooc_dir folder

          -Source code + +Expand source code +
          def clear_all_cooc_files(self):
               """
               Clear cooc_dir folder
          @@ -668,9 +674,11 @@ 

          Methods

          def transform_theta(self, model)
          -

          Transform theta matrix

          +

          Transform theta matrix

          -Source code + +Expand source code +
          def transform_theta(self, model):
               """
               Transform theta matrix
          @@ -753,7 +761,7 @@ 

          -

          Generated by pdoc 0.6.3.

          +

          Generated by pdoc 0.8.1.

          diff --git a/docs/cooking_machine/experiment.html b/docs/cooking_machine/experiment.html index b45b81c..298e57d 100644 --- a/docs/cooking_machine/experiment.html +++ b/docs/cooking_machine/experiment.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.experiment API documentation - - + + @@ -21,7 +21,9 @@

          Module topicnet.cooking_machine.experiment

          -Source code + +Expand source code +
          import os
           import re
           import json
          @@ -481,7 +483,7 @@ 

          Module topicnet.cooking_machine.experiment

          Experiment """ - from .models import TopicModel + from .models import DummyTopicModel files = os.listdir(load_path) if "params.json" not in files: @@ -496,7 +498,7 @@

          Module topicnet.cooking_machine.experiment

          for model_id in experiment.models.keys(): if model_id != START: model_save_path = os.path.join(load_path, model_id) - experiment.models[model_id] = TopicModel.load( + experiment.models[model_id] = DummyTopicModel.load( model_save_path, experiment ) @@ -667,7 +669,7 @@

          Module topicnet.cooking_machine.experiment

          return [] - def run(self, dataset, verbose=False, nb_verbose=False): + def run(self, dataset, verbose=False, nb_verbose=False, restore_mode=False): # noqa C901 """ Runs defined pipeline and prints out the result. @@ -688,7 +690,22 @@

          Module topicnet.cooking_machine.experiment

          continue cube = cube_description['cube'] - cube(stage_models, dataset) + if not restore_mode: + cube(stage_models, dataset) + else: + if cube_index < self.depth - 1: + print(f"[Restoring experiment]: skipping cube {cube_index}") + continue + if cube_index == self.depth - 1: + print( + f"[Restoring experiment]: selecting models at cube number" + f"{cube_index} (some models could be lost)" + ) + if cube_index >= self.depth: + print( + f"[Restoring experiment]: applying cube number {cube_index}" + ) + cube(stage_models, dataset) # TODO: either delete this line completely # or come up with a way to restore any cube using just info about it in self.cubes @@ -827,7 +844,7 @@

          Module topicnet.cooking_machine.experiment

          req_equal, metric, extremum) = parse_query_string(inner_query_string) if metric is not None or extremum is not None: - warnings.warn(f'You try to optimize model parameters.') + warnings.warn('You try to optimize model parameters.') candidate_tmodels = self.get_models_by_depth(level=level) special_models = choose_best_models( @@ -874,6 +891,9 @@

          Module topicnet.cooking_machine.experiment

          try: self.cubes += [{ 'action': stage_cube.action, + # TODO: should it be 'params': cube_param instead? + # it seems that it is possible to restore failed + # experiment with load() that way..? 'params': stage_cube.get_jsonable_from_parameters(), 'cube': stage_cube }] @@ -918,10 +938,10 @@

          Classes

          class Experiment -(topic_model, experiment_id, save_path, save_model_history=False, save_experiment=True, tree=None, models_info=None, cubes=None, low_memory=False) +(topic_model, experiment_id: str, save_path: str, save_model_history: bool = False, save_experiment: bool = True, tree: dict = None, models_info: dict = None, cubes: List[dict] = None, low_memory: bool = False)
          -

          Contains experiment, its description and descriptions of all models in the experiment.

          +

          Contains experiment, its description and descriptions of all models in the experiment.

          Initialize stage, also used for loading and creating new experiments.

          Parameters

          @@ -954,9 +974,11 @@

          Parameters

          If one wants to use squeezed topic model as before (eg. call topic_model.get_phi()), its inner ARTM model should be restored first. See docstring for TopicModel.make_dummy() method for reference.
          -
          +
          -Source code + +Expand source code +
          class Experiment(object):
               """
               Contains experiment, its description and descriptions of all models in the experiment.
          @@ -1377,7 +1399,7 @@ 

          Parameters

          Experiment """ - from .models import TopicModel + from .models import DummyTopicModel files = os.listdir(load_path) if "params.json" not in files: @@ -1392,7 +1414,7 @@

          Parameters

          for model_id in experiment.models.keys(): if model_id != START: model_save_path = os.path.join(load_path, model_id) - experiment.models[model_id] = TopicModel.load( + experiment.models[model_id] = DummyTopicModel.load( model_save_path, experiment ) @@ -1563,7 +1585,7 @@

          Parameters

          return [] - def run(self, dataset, verbose=False, nb_verbose=False): + def run(self, dataset, verbose=False, nb_verbose=False, restore_mode=False): # noqa C901 """ Runs defined pipeline and prints out the result. @@ -1584,7 +1606,22 @@

          Parameters

          continue cube = cube_description['cube'] - cube(stage_models, dataset) + if not restore_mode: + cube(stage_models, dataset) + else: + if cube_index < self.depth - 1: + print(f"[Restoring experiment]: skipping cube {cube_index}") + continue + if cube_index == self.depth - 1: + print( + f"[Restoring experiment]: selecting models at cube number" + f"{cube_index} (some models could be lost)" + ) + if cube_index >= self.depth: + print( + f"[Restoring experiment]: applying cube number {cube_index}" + ) + cube(stage_models, dataset) # TODO: either delete this line completely # or come up with a way to restore any cube using just info about it in self.cubes @@ -1723,7 +1760,7 @@

          Parameters

          req_equal, metric, extremum) = parse_query_string(inner_query_string) if metric is not None or extremum is not None: - warnings.warn(f'You try to optimize model parameters.') + warnings.warn('You try to optimize model parameters.') candidate_tmodels = self.get_models_by_depth(level=level) special_models = choose_best_models( @@ -1770,6 +1807,9 @@

          Parameters

          try: self.cubes += [{ 'action': stage_cube.action, + # TODO: should it be 'params': cube_param instead? + # it seems that it is possible to restore failed + # experiment with load() that way..? 'params': stage_cube.get_jsonable_from_parameters(), 'cube': stage_cube }] @@ -1808,7 +1848,7 @@

          Static methods

          def load(load_path)
          -

          Loads all params of the experiments. Recovers removed files if it is possible.

          +

          Loads all params of the experiments. Recovers removed files if it is possible.

          Parameters

          load_path : str
          @@ -1816,11 +1856,13 @@

          Parameters

          Returns

          -
          Experiment
          +
          Experiment
           
          -
          +
          -Source code + +Expand source code +
          @staticmethod
           def load(load_path):
               """
          @@ -1836,7 +1878,7 @@ 

          Returns

          Experiment """ - from .models import TopicModel + from .models import DummyTopicModel files = os.listdir(load_path) if "params.json" not in files: @@ -1851,7 +1893,7 @@

          Returns

          for model_id in experiment.models.keys(): if model_id != START: model_save_path = os.path.join(load_path, model_id) - experiment.models[model_id] = TopicModel.load( + experiment.models[model_id] = DummyTopicModel.load( model_save_path, experiment ) @@ -1863,10 +1905,12 @@

          Instance variables

          var depth
          -

          Returns depth of the tree.
          -Be careful, depth of the tree may not be the real experiment depth.

          +

          Returns depth of the tree.
          +Be careful, depth of the tree may not be the real experiment depth.

          -Source code + +Expand source code +
          @property
           def depth(self):
               """
          @@ -1879,9 +1923,11 @@ 

          Instance variables

          var root
          -
          +
          -Source code + +Expand source code +
          @property
           def root(self):
               """ """
          @@ -1895,14 +1941,16 @@ 

          Methods

          def add_cube(self, cube)
          -

          Adds cube to the experiment.

          +

          Adds cube to the experiment.

          Parameters

          cube : dict
          cube's params
          -
          +
          -Source code + +Expand source code +
          def add_cube(self, cube):
               """
               Adds cube to the experiment.
          @@ -1922,16 +1970,18 @@ 

          Parameters

          def add_dataset(self, dataset_id, dataset)
          -

          Adds dataset to storage.

          +

          Adds dataset to storage.

          Parameters

          dataset_id : str
          id of dataset to save
          dataset : Dataset
           
          -
          +
          -Source code + +Expand source code +
          def add_dataset(self, dataset_id, dataset):
               """
               Adds dataset to storage.
          @@ -1953,14 +2003,16 @@ 

          Parameters

          def add_model(self, topic_model)
          -

          Adds model to the experiment.

          +

          Adds model to the experiment.

          Parameters

          topic_model : TopicModel
          topic model
          -
          +
          -Source code + +Expand source code +
          def add_model(self, topic_model):
               """
               Adds model to the experiment.
          @@ -1982,7 +2034,7 @@ 

          Parameters

          def build(self, settings)
          -

          Builds experiment pipeline from description.

          +

          Builds experiment pipeline from description.

          Parameters

          settings : list of dicts
          @@ -1992,9 +2044,11 @@

          Returns

          Nothing
           
          -
          +
          -Source code + +Expand source code +
          def build(self, settings):
               """
               Builds experiment pipeline from description.
          @@ -2026,6 +2080,9 @@ 

          Returns

          try: self.cubes += [{ 'action': stage_cube.action, + # TODO: should it be 'params': cube_param instead? + # it seems that it is possible to restore failed + # experiment with load() that way..? 'params': stage_cube.get_jsonable_from_parameters(), 'cube': stage_cube }] @@ -2039,7 +2096,7 @@

          Returns

          def describe_model(self, model_id)
          -

          Returns all scores mentioned on the model stage criteria.

          +

          Returns all scores mentioned on the model stage criteria.

          Parameters

          model_id : str
          @@ -2049,9 +2106,11 @@

          Returns

          description_string : str
           
          -
          +
          -Source code + +Expand source code +
          def describe_model(self, model_id):
               """
               Returns all scores mentioned on the model stage criteria.
          @@ -2090,10 +2149,10 @@ 

          Returns

          -def get_description(self, min_len_per_cube=26, len_tree_step=27) +def get_description(self, min_len_per_cube: int = 26, len_tree_step: int = 27)
          -

          Creates description of the tree that you can print. +

          Creates description of the tree that you can print. Print is good when you use no more than 3 cubes at all.

          Parameters

          @@ -2108,9 +2167,11 @@

          Returns

          str
          description to print
          -
          +
          -Source code + +Expand source code +
          def get_description(self,
                               min_len_per_cube: int = MODEL_NAME_LENGTH,
                               len_tree_step: int = MODEL_NAME_LENGTH + 1):
          @@ -2147,9 +2208,11 @@ 

          Returns

          def get_models_by_depth(self, level=None)
          -
          +
          -Source code + +Expand source code +
          def get_models_by_depth(self, level=None):
               """ """
               if level is None:
          @@ -2167,14 +2230,16 @@ 

          Returns

          def get_params(self)
          -

          Gets params of the experiment.

          +

          Gets params of the experiment.

          Returns

          parameters : dict
           
          -
          +
          -Source code + +Expand source code +
          def get_params(self):
               """
               Gets params of the experiment.
          @@ -2196,19 +2261,21 @@ 

          Returns

          -def preprocess_query(self, query_string, level) +def preprocess_query(self, query_string: str, level)
          -

          Preprocesses special queries with functions inside.

          +

          Preprocesses special queries with functions inside.

          Parameters

          query_string : str
          string for processing
          level : int
          model level
          -
          +
          -Source code + +Expand source code +
          def preprocess_query(self, query_string: str, level):
               """
               Preprocesses special queries with functions inside.
          @@ -2246,7 +2313,7 @@ 

          Parameters

          req_equal, metric, extremum) = parse_query_string(inner_query_string) if metric is not None or extremum is not None: - warnings.warn(f'You try to optimize model parameters.') + warnings.warn('You try to optimize model parameters.') candidate_tmodels = self.get_models_by_depth(level=level) special_models = choose_best_models( @@ -2267,14 +2334,16 @@

          Parameters

          def remove_dataset(self, dataset_id)
          -

          Removes dataset from storage.

          +

          Removes dataset from storage.

          Parameters

          dataset_id : str
          id of dataset to remove
          -
          +
          -Source code + +Expand source code +
          def remove_dataset(self, dataset_id):
               """
               Removes dataset from storage.
          @@ -2292,10 +2361,10 @@ 

          Parameters

          -def run(self, dataset, verbose=False, nb_verbose=False) +def run(self, dataset, verbose=False, nb_verbose=False, restore_mode=False)
          -

          Runs defined pipeline and prints out the result.

          +

          Runs defined pipeline and prints out the result.

          Parameters

          dataset : Dataset
          @@ -2305,10 +2374,12 @@

          Parameters

          nb_verbose : bool
          parameter that determines where the output is produced; if False, prints to console (Default value = False)
          -
          +
          -Source code -
          def run(self, dataset, verbose=False, nb_verbose=False):
          +
          +Expand source code
          +
          +
          def run(self, dataset, verbose=False, nb_verbose=False, restore_mode=False):  # noqa C901
               """
               Runs defined pipeline and prints out the result.
           
          @@ -2329,7 +2400,22 @@ 

          Parameters

          continue cube = cube_description['cube'] - cube(stage_models, dataset) + if not restore_mode: + cube(stage_models, dataset) + else: + if cube_index < self.depth - 1: + print(f"[Restoring experiment]: skipping cube {cube_index}") + continue + if cube_index == self.depth - 1: + print( + f"[Restoring experiment]: selecting models at cube number" + f"{cube_index} (some models could be lost)" + ) + if cube_index >= self.depth: + print( + f"[Restoring experiment]: applying cube number {cube_index}" + ) + cube(stage_models, dataset) # TODO: either delete this line completely # or come up with a way to restore any cube using just info about it in self.cubes @@ -2358,17 +2444,19 @@

          Parameters

          -def save(self, window_size=1500, mode='all') +def save(self, window_size: int = 1500, mode: str = 'all')
          -

          Saves all params of the experiment to save_path/experiment_id.

          +

          Saves all params of the experiment to save_path/experiment_id.

          Parameters

          window_size : int
          pixels size of window in html description (Default value = 1500)
          -
          +
          -Source code + +Expand source code +
          def save(self, window_size: int = 1500, mode: str = 'all'):
               """
               Saves all params of the experiment to save_path/experiment_id.
          @@ -2398,7 +2486,7 @@ 

          Parameters

          def save_models(self, mode='all')
          -

          Saves experiment models with respect to selected way of saving.

          +

          Saves experiment models with respect to selected way of saving.

          Parameters

          mode : str
          @@ -2406,9 +2494,11 @@

          Parameters

          'all' - save all models in experiment
          'tree' - save only stem and leaves from the last level
          'last' - save only leaves from the last level
          -

          +
          -Source code + +Expand source code +
          def save_models(self, mode='all'):
               """
               Saves experiment models with respect to selected way of saving.
          @@ -2454,7 +2544,7 @@ 

          Parameters

          def select(self, query_string='', models_num=None, level=None)
          -

          Selects all models satisfying the query string +

          Selects all models satisfying the query string from all models on a particular depth.

          Parameters

          @@ -2467,7 +2557,7 @@

          Parameters

          Returns

          -
          result_topic_models : list of restored TopicModels
          +
          result_topic_models : list of restored TopicModels
           

          String Format

          @@ -2507,9 +2597,11 @@

          Examples

          "PerplexityScore@words < 1.1 * MINIMUM(PerplexityScore@all) and model.num_topics > 12" )

          -
          +
          -Source code + +Expand source code +
          def select(self, query_string='', models_num=None, level=None):
               """
               Selects all models satisfying the query string
          @@ -2628,7 +2720,7 @@ 

          Examples

          def set_criteria(self, cube_index, criteria)
          -

          Allows to edit model selection criteria +

          Allows to edit model selection criteria on each stage of the Experiment

          Parameters

          @@ -2641,9 +2733,11 @@

          Returns

          Nothing
           
          -
          +
          -Source code + +Expand source code +
          def set_criteria(self, cube_index, criteria):
               """
               Allows to edit model selection criteria
          @@ -2673,9 +2767,11 @@ 

          Returns

          def show(self)
          -

          Shows description of the experiment.

          +

          Shows description of the experiment.

          -Source code + +Expand source code +
          def show(self):
               """
               Shows description of the experiment.
          @@ -2687,17 +2783,19 @@ 

          Returns

          -def squeeze_models(self, depth=None) +def squeeze_models(self, depth: int = None)
          -

          Transforms models to dummies so as to occupy less RAM memory

          +

          Transforms models to dummies so as to occupy less RAM memory

          Parameters

          depth : int
          Models on what depth are to be squeezed, i.e. transformed to dummies
          -
          +
          -Source code + +Expand source code +
          def squeeze_models(self, depth: int = None):
               """Transforms models to dummies so as to occupy less RAM memory
           
          @@ -2764,7 +2862,7 @@ 

          -

          Generated by pdoc 0.6.3.

          +

          Generated by pdoc 0.8.1.

          diff --git a/docs/cooking_machine/index.html b/docs/cooking_machine/index.html index 14a1e0d..0ca01e5 100644 --- a/docs/cooking_machine/index.html +++ b/docs/cooking_machine/index.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine API documentation - - + + @@ -19,38 +19,11 @@

          Module topicnet.cooking_machine

          -

          Cooking Machine

          -

          Cube

          -

          A unit of model training pipeline. This unit encapsulates an action over one or many model hyperparameters. This action and hyperparameter space are stored as cube properties and then saved in Experiment.

          -

          Input: model or list of models, regularizer or list of them, hyperparameter search sapce(grid), iterations number or a function defining it, custom metrics.
          -Output: models.
          -Body: performs actions over artm model. Can modify, create new models and alter their Experiment.

          -

          Model

          -

          A class containing Topic Model and its description:

          -
            -
          • stores topic model description;
          • -
          • outputs the description in human-readable form;
          • -
          • the model can only load and copy itself, the artm-model is an attribute and in order to change it is should be extracted, modified and put back;
          • -
          • stores experiment id;
          • -
          • stores parent model id;
          • -
          • stores model topic names;
          • -
          • stores regularizers list with their parameters;
          • -
          • stores modality weights;
          • -
          • stores save path for data, model and model information;
          • -
          • stores training metric values.
          • -
          -

          Experiment

          -

          Class providing experiment infrastructure:

          -
            -
          • keeps the description of all actions on the models;
          • -
          • provides human-readable log of experiment;
          • -
          • keeps the model training sequence in memory;
          • -
          • automaticly runs integrity check;
          • -
          • able to copy itself.
          • -
          -Source code + +Expand source code +
          from .dataset import Dataset
           from .dataset import BaseDataset
           from .experiment import Experiment
          @@ -63,52 +36,52 @@ 

          Sub-modules

          topicnet.cooking_machine.config_parser
          -

          Parsing text file into Experiment instance using strictyaml -(github.com/crdoconnor/strictyaml/) …

          +

          Parsing text file into Experiment instance using strictyaml +(github.com/crdoconnor/strictyaml/) …

          topicnet.cooking_machine.cubes
          -
          +
          topicnet.cooking_machine.dataset
          -
          +
          topicnet.cooking_machine.dataset_cooc
          -
          +
          topicnet.cooking_machine.experiment
          -
          +
          topicnet.cooking_machine.model_constructor
          -
          +
          topicnet.cooking_machine.model_tracking
          -
          +
          topicnet.cooking_machine.models
          -
          +
          topicnet.cooking_machine.pretty_output
          -
          +
          topicnet.cooking_machine.recipes
          -
          +
          topicnet.cooking_machine.rel_toolbox_lite
          -
          +
          topicnet.cooking_machine.routine
          -
          +
          @@ -120,7 +93,7 @@

          Sub-modules

          diff --git a/docs/cooking_machine/model_constructor.html b/docs/cooking_machine/model_constructor.html index c68ce3b..5699f8b 100644 --- a/docs/cooking_machine/model_constructor.html +++ b/docs/cooking_machine/model_constructor.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.model_constructor API documentation - - + + @@ -21,7 +21,9 @@

          Module topicnet.cooking_machine.model_constructor
          -Source code + +Expand source code +
          import warnings
           
           from typing import (
          @@ -244,10 +246,10 @@ 

          Module topicnet.cooking_machine.model_constructor

          Functions

          -def add_standard_scores(model, dictionary=None, main_modality='@lemmatized', all_modalities=('@lemmatized', '@ngramms')) +def add_standard_scores(model: artm.artm_model.ARTM, dictionary: artm.dictionary.Dictionary = None, main_modality: str = '@lemmatized', all_modalities: List[str] = ('@lemmatized', '@ngramms')) -> NoneType
          -

          Adds standard scores for the model.

          +

          Adds standard scores for the model.

          Parameters

          model
          @@ -258,9 +260,11 @@

          Parameters

           
          all_modalities
           
          -
          +
          -Source code + +Expand source code +
          def add_standard_scores(
                   model: artm.ARTM,
                   dictionary: artm.Dictionary = None,
          @@ -323,7 +327,7 @@ 

          Parameters

          def create_default_topics(specific_topics, background_topics)
          -

          Creates list of background topics and specific topics

          +

          Creates list of background topics and specific topics

          Parameters

          specific_topics : list or int
          @@ -332,9 +336,11 @@

          Parameters

           

          Returns

          -

          (list, list)

          +

          (list, list)

          -Source code + +Expand source code +
          def create_default_topics(specific_topics, background_topics):
               """
               Creates list of background topics and specific topics
          @@ -379,9 +385,11 @@ 

          Returns

          def init_model(topic_names, seed=None, class_ids=None)
          -

          Creates basic artm model

          +

          Creates basic artm model

          -Source code + +Expand source code +
          def init_model(topic_names, seed=None, class_ids=None):
               """
               Creates basic artm model
          @@ -401,10 +409,10 @@ 

          Returns

          -def init_simple_default_model(dataset, modalities_to_use, main_modality, specific_topics, background_topics) +def init_simple_default_model(dataset: Dataset, modalities_to_use: List[str], main_modality: str, specific_topics: List[str], background_topics: List[str]) -> artm.artm_model.ARTM
          -

          Creates simple artm.ARTM model with standard scores.

          +

          Creates simple artm.ARTM model with standard scores.

          Parameters

          dataset
          @@ -431,9 +439,11 @@

          Returns

          model : artm.ARTM
           
          -
          +
          -Source code + +Expand source code +
          def init_simple_default_model(
                   dataset: Dataset,
                   modalities_to_use: List[str] or Dict[str, float],
          @@ -544,7 +554,7 @@ 

          Index

          diff --git a/docs/cooking_machine/model_tracking.html b/docs/cooking_machine/model_tracking.html index 55c2975..eb64232 100644 --- a/docs/cooking_machine/model_tracking.html +++ b/docs/cooking_machine/model_tracking.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.model_tracking API documentation - - + + @@ -21,7 +21,9 @@

          Module topicnet.cooking_machine.model_tracking
          -Source code + +Expand source code +
          import numpy as np
           
           from copy import deepcopy
          @@ -351,18 +353,20 @@ 

          Classes

          class Tree -(tree=None) +(tree: dict = None)
          -

          Contains tree of an experiment and methods to work with it.

          +

          Contains tree of an experiment and methods to work with it.

          Initial stage.

          Parameters

          tree : dict
          tree of an experiment (Default value = None)
          -
          +
          -Source code + +Expand source code +
          class Tree(object):
               """
               Contains tree of an experiment and methods to work with it.
          @@ -669,7 +673,7 @@ 

          Static methods

          def transform_to_leaf(topic_model)
          -

          Transforms TopicModel to a leaf for the tree for further integration in the tree.

          +

          Transforms TopicModel to a leaf for the tree for further integration in the tree.

          Parameters

          topic_model : TopicModel
          @@ -679,9 +683,11 @@

          Returns

          dict
          leaf of the tree
          -
          +
          -Source code + +Expand source code +
          @staticmethod
           def transform_to_leaf(topic_model):
               """
          @@ -711,14 +717,16 @@ 

          Methods

          def add_model(self, topic_model)
          -

          Adds model in the tree of an experiment.

          +

          Adds model in the tree of an experiment.

          Parameters

          topic_model : TopicModel
          topic model
          -
          +
          -Source code + +Expand source code +
          def add_model(self, topic_model):
               """
               Adds model in the tree of an experiment.
          @@ -736,14 +744,16 @@ 

          Parameters

          def clone(self)
          -

          Clones Tree class object.

          +

          Clones Tree class object.

          Returns

          -
          tree : Tree
          +
          tree : Tree
          copy of Tree object
          -
          +
          -Source code + +Expand source code +
          def clone(self):
               """
               Clones Tree class object.
          @@ -762,14 +772,16 @@ 

          Returns

          def get_depth(self)
          -

          Gets current depth of the tree.

          +

          Gets current depth of the tree.

          Returns

          int
          depth of the tree
          -
          +
          -Source code + +Expand source code +
          def get_depth(self):
               """
               Gets current depth of the tree.
          @@ -787,14 +799,16 @@ 

          Returns

          def get_description(self)
          -

          Creates description of the tree.

          +

          Creates description of the tree.

          Returns

          list
          strings of description
          -
          +
          -Source code + +Expand source code +
          def get_description(self):
               """
               Creates description of the tree.
          @@ -814,14 +828,16 @@ 

          Returns

          def get_model_ids(self)
          -

          Gets models_ids of all models in the tree.

          +

          Gets models_ids of all models in the tree.

          Returns

          list
          model_ids of all models in the tree
          -
          +
          -Source code + +Expand source code +
          def get_model_ids(self):
               """
               Gets models_ids of all models in the tree.
          @@ -839,14 +855,16 @@ 

          Returns

          def prune(self, depth)
          -

          Prunes tree to get particular depth and updates it.

          +

          Prunes tree to get particular depth and updates it.

          Parameters

          depth : int
          desired tree depth
          -
          +
          -Source code + +Expand source code +
          def prune(self, depth):
               """
               Prunes tree to get particular depth and updates it.
          @@ -864,15 +882,17 @@ 

          Parameters

          def show(self)
          -

          Shows the tree of an experiment in text format. +

          Shows the tree of an experiment in text format. Shows description of the tree.

          Returns

          str
          description in txt format
          -
          +
          -Source code + +Expand source code +
          def show(self):
               """
               Shows the tree of an experiment in text format.
          @@ -925,7 +945,7 @@ 

          -

          Generated by pdoc 0.6.3.

          +

          Generated by pdoc 0.8.1.

          diff --git a/docs/cooking_machine/models/base_model.html b/docs/cooking_machine/models/base_model.html index d57972c..9467dc1 100644 --- a/docs/cooking_machine/models/base_model.html +++ b/docs/cooking_machine/models/base_model.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.models.base_model API documentation - - + + @@ -21,7 +21,9 @@

          Module topicnet.cooking_machine.models.base_model
          -Source code + +Expand source code +
          import json
           import os
           from copy import deepcopy
          @@ -350,9 +352,11 @@ 

          Functions

          def padd_model_name(model_id)
          -
          +
          -Source code + +Expand source code +
          def padd_model_name(model_id):
               padding = MODEL_NAME_LENGTH - len(model_id)
           
          @@ -374,7 +378,7 @@ 

          Classes

          (model_id=None, parent_model_id=None, experiment=None, *args, **kwargs)
          -

          Initialize stage, also used for loading previously saved experiments.

          +

          Initialize stage, also used for loading previously saved experiments.

          Parameters

          model_id : str
          @@ -383,9 +387,11 @@

          Parameters

          model id from which current model was created (Default value = None)
          experiment : Experiment
          the experiment to which the model is bound (Default value = None)
          -
          +
          -Source code + +Expand source code +
          class BaseModel(object):
               def __init__(self, model_id=None, parent_model_id=None, experiment=None, *args, **kwargs):
                   """
          @@ -683,8 +689,6 @@ 

          Parameters

          Subclasses

          • TopicModel
          • -
          • topicnet.tests.test_intratext_coherence_score.MockModel
          • -
          • topicnet.tests.test_topic_mapping.dummy_model

          Static methods

          @@ -692,13 +696,15 @@

          Static methods

          def load(path, *args, **kwargs)
          -

          Parameters

          +

          Parameters

          path : str
           
          -
          +
          -Source code + +Expand source code +
          @staticmethod
           def load(path, *args, **kwargs):
               """
          @@ -716,9 +722,11 @@ 

          Instance variables

          var depth
          -

          Returns depth of the model.

          +

          Returns depth of the model.

          -Source code + +Expand source code +
          @property
           def depth(self):
               """
          @@ -730,9 +738,11 @@ 

          Instance variables

          var description
          -
          +
          -Source code + +Expand source code +
          @property
           def description(self):
               """ """
          @@ -741,9 +751,11 @@ 

          Instance variables

          var model_default_save_path
          -
          +
          -Source code + +Expand source code +
          @property
           def model_default_save_path(self):
               """ """
          @@ -772,9 +784,11 @@ 

          Instance variables

          var model_id
          -
          +
          -Source code + +Expand source code +
          @property
           def model_id(self):
               """ """
          @@ -783,9 +797,11 @@ 

          Instance variables

          var parent_model_id
          -
          +
          -Source code + +Expand source code +
          @property
           def parent_model_id(self):
               """ """
          @@ -794,9 +810,11 @@ 

          Instance variables

          var score_functions
          -
          +
          -Source code + +Expand source code +
          @property
           def score_functions(self):
               """ """
          @@ -805,9 +823,11 @@ 

          Instance variables

          var scores
          -
          +
          -Source code + +Expand source code +
          @property
           def scores(self):
               """ """
          @@ -821,14 +841,16 @@ 

          Methods

          def add_cube(self, cube)
          -

          Adds cube to the model.

          +

          Adds cube to the model.

          Parameters

          cube : dict
          training cube params.
          -
          +
          -Source code + +Expand source code +
          def add_cube(self, cube):
               """
               Adds cube to the model.
          @@ -847,9 +869,11 @@ 

          Parameters

          def clone(self)
          -
          +
          -Source code + +Expand source code +
          def clone(self):
               """ """
               return deepcopy(self)
          @@ -859,9 +883,11 @@

          Parameters

          def get_jsonable_from_parameters(self)
          -
          +
          -Source code + +Expand source code +
          def get_jsonable_from_parameters(self):
               """ """
               raise NotImplementedError
          @@ -871,14 +897,16 @@

          Parameters

          def get_parameters(self)
          -

          Gets all params of the model.

          +

          Gets all params of the model.

          Returns

          dict
          parameters of the model
          -
          +
          -Source code + +Expand source code +
          def get_parameters(self):
               """
               Gets all params of the model.
          @@ -910,9 +938,11 @@ 

          Returns

          def get_phi(self, *args, **kwargs)
          -
          +
          -Source code + +Expand source code +
          def get_phi(self, *args, **kwargs):
               """ """
               raise NotImplementedError
          @@ -922,13 +952,15 @@

          Returns

          def get_theta(self, dataset=None, *args, **kwargs)
          -

          Parameters

          +

          Parameters

          dataset : Dataset
          (Default value = None)
          -
          +
          -Source code + +Expand source code +
          def get_theta(self, dataset=None, *args, **kwargs):
               """
           
          @@ -945,13 +977,15 @@ 

          Returns

          def save(self, path, *args, **kwargs)
          -

          Parameters

          +

          Parameters

          path : str
           
          -
          +
          -Source code + +Expand source code +
          def save(self, path, *args, **kwargs):
               """
           
          @@ -967,9 +1001,11 @@ 

          Returns

          def save_parameters(self, model_save_path=None)
          -

          Saves params of the model.

          +

          Saves params of the model.

          -Source code + +Expand source code +
          def save_parameters(self, model_save_path=None):
               """
               Saves params of the model.
          @@ -988,9 +1024,11 @@ 

          Returns

          def set_model_id_as_timestamp(self)
          -
          +
          -Source code + +Expand source code +
          def set_model_id_as_timestamp(self):
               """ """
               self._model_id = padd_model_name(get_timestamp_in_str_format())
          @@ -1047,7 +1085,7 @@

          -

          Generated by pdoc 0.6.3.

          +

          Generated by pdoc 0.8.1.

          diff --git a/docs/cooking_machine/models/base_regularizer.html b/docs/cooking_machine/models/base_regularizer.html index db1dbd1..2283130 100644 --- a/docs/cooking_machine/models/base_regularizer.html +++ b/docs/cooking_machine/models/base_regularizer.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.models.base_regularizer API documentation - - + + @@ -21,7 +21,9 @@

          Module topicnet.cooking_machine.models.base_regularizer
          -Source code + +Expand source code +
          class BaseRegularizer:
               """
               Base regularizer class to construct custom regularizers.
          @@ -60,9 +62,11 @@ 

          Classes

          (name, tau, gamma=None)
          -

          Base regularizer class to construct custom regularizers.

          +

          Base regularizer class to construct custom regularizers.

          -Source code + +Expand source code +
          class BaseRegularizer:
               """
               Base regularizer class to construct custom regularizers.
          @@ -98,13 +102,15 @@ 

          Methods

          def attach(self, model)
          -

          Parameters

          +

          Parameters

          -
          model : ARTM model
          +
          model : ARTM model
          necessary to apply master component
          -
          +
          -Source code + +Expand source code +
          def attach(self, model):
               """
           
          @@ -120,9 +126,11 @@ 

          Methods

          def grad(self, pwt, nwt)
          -
          +
          -Source code + +Expand source code +
          def grad(self, pwt, nwt):
               raise NotImplementedError('grad method should be overrided in an inherited class')
          @@ -158,7 +166,7 @@

          -

          Generated by pdoc 0.6.3.

          +

          Generated by pdoc 0.8.1.

          diff --git a/docs/cooking_machine/models/base_score.html b/docs/cooking_machine/models/base_score.html index 6e28d94..ebdb7b3 100644 --- a/docs/cooking_machine/models/base_score.html +++ b/docs/cooking_machine/models/base_score.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.models.base_score API documentation - - + + @@ -21,8 +21,17 @@

          Module topicnet.cooking_machine.models.base_score
          -Source code + +Expand source code +
          import dill
          +
          +from typing import (
          +    Any,
          +    Callable,
          +    Dict,
          +)
          +
           from . import scores as tn_scores
           
           
          @@ -31,21 +40,93 @@ 

          Module topicnet.cooking_machine.models.base_score Base Class to construct custom score functions. """ - def __init__(self, name: str = None): # TODO: name should not be optional + _PRECOMPUTED_DATA_PARAMETER_NAME = 'precomputed_data' + + # TODO: name should not be optional + def __init__( + self, + name: str = None, + should_compute: Callable[[int], bool] or bool = None): """ Parameters ---------- - name: + name Name of the score - + should_compute + Function which decides whether the score should be computed + on the current fit iteration or not. + If `should_compute` is `None`, then score is going to be computed on every iteration. + At the same time, whatever function one defines, + score is always computed on the last fit iteration. + This is done for two reasons. + Firstly, so that the score is always computed at least once during `model._fit()`. + Secondly, so that `experiment.select()` works correctly. + + The parameter `should_compute` might be helpful + if the score is slow but one still needs + to get the dependence of the score on iteration + (for the described case, one may compute the score + on every even iteration or somehow else). + However, be aware that if `should_compute` is used for some model's scores, + then the scores may have different number of values in `model.scores`! + Number of score values is the number of times the scores was calculated; + first value corresponds to the first fit iteration + which passed `should_compute` etc. + + There are a couple of things also worth noting. + Fit iteration numbering starts from zero. + And every new `model._fit()` call is a new range of fit iterations. + + Examples + -------- + Scores created below are unworkable (as BaseScore has no `call` method inplemented). + These are just the examples of how one can create a score and set some of its parameters. 
+ + Scores to be computed on every iteration: + + >>> score = BaseScore() + >>> score = BaseScore(should_compute=BaseScore.compute_always) + >>> score = BaseScore(should_compute=lambda i: True) + >>> score = BaseScore(should_compute=True) + + Scores to be computed only on the last iteration: + + >>> score = BaseScore(should_compute=BaseScore.compute_on_last) + >>> score = BaseScore(should_compute=lambda i: False) + >>> score = BaseScore(should_compute=False) + + Score to be computed only on even iterations: + + >>> score = BaseScore(should_compute=lambda i: i % 2 == 0) """ self._name = name + + if should_compute is None: + should_compute = self.compute_always + elif should_compute is True: + should_compute = self.compute_always + elif should_compute is False: + should_compute = self.compute_on_last + elif not isinstance(should_compute, type(lambda: None)): + raise TypeError(f'Unknown type of `should_compute`: {type(should_compute)}!') + else: + pass + + self._should_compute = should_compute self.value = [] if not hasattr(tn_scores, self.__class__.__name__): setattr(tn_scores, self.__class__.__name__, self.__class__) + @staticmethod + def compute_always(fit_iteration: int) -> bool: + return True + + @staticmethod + def compute_on_last(fit_iteration: int) -> bool: + return False + def __repr__(self): return f'{self.__class__.__name__}' @@ -81,7 +162,7 @@

          Module topicnet.cooking_machine.models.base_score self.value.append(score) - def call(self, model): + def call(self, model, precomputed_data: Dict[str, Any] = None): """ Call to custom score function. @@ -89,6 +170,12 @@

          Module topicnet.cooking_machine.models.base_score ---------- model : TopicModel a TopicNet model inherited from BaseModel + precomputed_data + Data which scores may share between each other during *one fit iteration*. + For example, if the model has several scores of the same score class, + and there is a heavy time consuming computation inside this score class, + it may be useful to perform the calculations *only once*, for one score instance, + and then make the result visible for all other scores that might need it. Returns ------- @@ -102,6 +189,45 @@

          Module topicnet.cooking_machine.models.base_score and then use this logic in query in Experiment's `select()` method. If one need ARTM model for score (not TopicNet one), it is available as model._model + + When creating a custom score class, + it is recommended to use `**kwargs` in the score's `call` method, + so that all `BaseScore` optional parameters are also available + in its successor score classes. + + Examples + -------- + + Score which uses `precomputed_data`: + + >>> import time + ... + >>> class NewScore(BaseScore): + ... def __init__(self, name: str, multiplier: float): + ... super().__init__(name=name) + ... + ... self._multiplier = multiplier + ... self._heavy_value_name = 'time_consuming_value_name' + ... + ... def call(self, model, precomputed_data = None): + ... if precomputed_data is None: + ... # Parameter `precomputed_data` is optional in BaseScore + ... # So this case also should be supported + ... heavy_value = self._compute_heavy(model) + ... elif self._heavy_value_name in precomputed_data: + ... # This is going to be fast + ... heavy_value = precomputed_data[self._heavy_value_name] + ... else: + ... # This is slow (but only one such call!) + ... heavy_value = self._compute_heavy(model) + ... precomputed_data[self._heavy_value_name] = heavy_value + ... + ... return heavy_value * self._multiplier + ... + ... def _compute_heavy(self, model): + ... time.sleep(100) # just for demonstration + ... + ... return 0 """ raise NotImplementedError('Define your score here')

          @@ -117,35 +243,152 @@

          Classes

          class BaseScore -(name=None) +(name: str = None, should_compute: Callable[[int], bool] = None)
          -

          Base Class to construct custom score functions.

          +

          Base Class to construct custom score functions.

          Parameters

          -

          name: -Name of the score

          +
          +
          name
          +
          Name of the score
          +
          should_compute
          +
          +

          Function which decides whether the score should be computed +on the current fit iteration or not. +If should_compute is None, then score is going to be computed on every iteration. +At the same time, whatever function one defines, +score is always computed on the last fit iteration. +This is done for two reasons. +Firstly, so that the score is always computed at least once during model._fit(). +Secondly, so that experiment.select() works correctly.

          +

          The parameter should_compute might be helpful +if the score is slow but one still needs +to get the dependence of the score on iteration +(for the described case, one may compute the score +on every even iteration or somehow else). +However, be aware that if should_compute is used for some model's scores, +then the scores may have different number of values in model.scores! +Number of score values is the number of times the score was calculated; +first value corresponds to the first fit iteration +which passed should_compute etc.

          +

          There are a couple of things also worth noting. +Fit iteration numbering starts from zero. +And every new model._fit() call is a new range of fit iterations.

          +
          +
          +

          Examples

          +

          Scores created below are unworkable (as BaseScore has no call method implemented). +These are just the examples of how one can create a score and set some of its parameters.

          +

          Scores to be computed on every iteration:

          +
          >>> score = BaseScore()
          +>>> score = BaseScore(should_compute=BaseScore.compute_always)
          +>>> score = BaseScore(should_compute=lambda i: True)
          +>>> score = BaseScore(should_compute=True)
          +
          +

          Scores to be computed only on the last iteration:

          +
          >>> score = BaseScore(should_compute=BaseScore.compute_on_last)
          +>>> score = BaseScore(should_compute=lambda i: False)
          +>>> score = BaseScore(should_compute=False)
          +
          +

          Score to be computed only on even iterations:

          +
          >>> score = BaseScore(should_compute=lambda i: i % 2 == 0)
          +
          -Source code + +Expand source code +
          class BaseScore:
               """
               Base Class to construct custom score functions.
           
               """
          -    def __init__(self, name: str = None):  # TODO: name should not be optional
          +    _PRECOMPUTED_DATA_PARAMETER_NAME = 'precomputed_data'
          +
          +    # TODO: name should not be optional
          +    def __init__(
          +            self,
          +            name: str = None,
          +            should_compute: Callable[[int], bool] or bool = None):
                   """
           
                   Parameters
                   ----------
          -        name:
          +        name
                       Name of the score
          -
          +        should_compute
          +            Function which decides whether the score should be computed
          +            on the current fit iteration or not.
          +            If `should_compute` is `None`, then score is going to be computed on every iteration.
          +            At the same time, whatever function one defines,
          +            score is always computed on the last fit iteration.
          +            This is done for two reasons.
          +            Firstly, so that the score is always computed at least once during `model._fit()`.
          +            Secondly, so that `experiment.select()` works correctly.
          +
          +            The parameter `should_compute` might be helpful
          +            if the score is slow but one still needs
          +            to get the dependence of the score on iteration
          +            (for the described case, one may compute the score
          +            on every even iteration or somehow else).
          +            However, be aware that if `should_compute` is used for some model's scores,
          +            then the scores may have different number of values in `model.scores`!
          +            Number of score values is the number of times the score was calculated;
          +            first value corresponds to the first fit iteration
          +            which passed `should_compute` etc.
          +
          +            There are a couple of things also worth noting.
          +            Fit iteration numbering starts from zero.
          +            And every new `model._fit()` call is a new range of fit iterations.
          +
          +        Examples
          +        --------
          +        Scores created below are unworkable (as BaseScore has no `call` method implemented).
          +        These are just the examples of how one can create a score and set some of its parameters.
          +
          +        Scores to be computed on every iteration:
          +
          +        >>> score = BaseScore()
          +        >>> score = BaseScore(should_compute=BaseScore.compute_always)
          +        >>> score = BaseScore(should_compute=lambda i: True)
          +        >>> score = BaseScore(should_compute=True)
          +
          +        Scores to be computed only on the last iteration:
          +
          +        >>> score = BaseScore(should_compute=BaseScore.compute_on_last)
          +        >>> score = BaseScore(should_compute=lambda i: False)
          +        >>> score = BaseScore(should_compute=False)
          +
          +        Score to be computed only on even iterations:
          +
          +        >>> score = BaseScore(should_compute=lambda i: i % 2 == 0)
                   """
                   self._name = name
          +
          +        if should_compute is None:
          +            should_compute = self.compute_always
          +        elif should_compute is True:
          +            should_compute = self.compute_always
          +        elif should_compute is False:
          +            should_compute = self.compute_on_last
          +        elif not isinstance(should_compute, type(lambda: None)):
          +            raise TypeError(f'Unknown type of `should_compute`: {type(should_compute)}!')
          +        else:
          +            pass
          +
          +        self._should_compute = should_compute
                   self.value = []
           
                   if not hasattr(tn_scores, self.__class__.__name__):
                       setattr(tn_scores, self.__class__.__name__, self.__class__)
           
          +    @staticmethod
          +    def compute_always(fit_iteration: int) -> bool:
          +        return True
          +
          +    @staticmethod
          +    def compute_on_last(fit_iteration: int) -> bool:
          +        return False
          +
               def __repr__(self):
                   return f'{self.__class__.__name__}'
           
          @@ -181,7 +424,7 @@ 

          Parameters

          self.value.append(score) - def call(self, model): + def call(self, model, precomputed_data: Dict[str, Any] = None): """ Call to custom score function. @@ -189,6 +432,12 @@

          Parameters

          ---------- model : TopicModel a TopicNet model inherited from BaseModel + precomputed_data + Data which scores may share between each other during *one fit iteration*. + For example, if the model has several scores of the same score class, + and there is a heavy time consuming computation inside this score class, + it may be useful to perform the calculations *only once*, for one score instance, + and then make the result visible for all other scores that might need it. Returns ------- @@ -202,26 +451,95 @@

          Parameters

          and then use this logic in query in Experiment's `select()` method. If one need ARTM model for score (not TopicNet one), it is available as model._model + + When creating a custom score class, + it is recommended to use `**kwargs` in the score's `call` method, + so that all `BaseScore` optional parameters are also available + in its successor score classes. + + Examples + -------- + + Score which uses `precomputed_data`: + + >>> import time + ... + >>> class NewScore(BaseScore): + ... def __init__(self, name: str, multiplier: float): + ... super().__init__(name=name) + ... + ... self._multiplier = multiplier + ... self._heavy_value_name = 'time_consuming_value_name' + ... + ... def call(self, model, precomputed_data = None): + ... if precomputed_data is None: + ... # Parameter `precomputed_data` is optional in BaseScore + ... # So this case also should be supported + ... heavy_value = self._compute_heavy(model) + ... elif self._heavy_value_name in precomputed_data: + ... # This is going to be fast + ... heavy_value = precomputed_data[self._heavy_value_name] + ... else: + ... # This is slow (but only one such call!) + ... heavy_value = self._compute_heavy(model) + ... precomputed_data[self._heavy_value_name] = heavy_value + ... + ... return heavy_value * self._multiplier + ... + ... def _compute_heavy(self, model): + ... time.sleep(100) # just for demonstration + ... + ... return 0 """ raise NotImplementedError('Define your score here')

          Subclasses

          Static methods

          +
          +def compute_always(fit_iteration: int) -> bool +
          +
          +
          +
          + +Expand source code + +
          @staticmethod
          +def compute_always(fit_iteration: int) -> bool:
          +    return True
          +
          +
          +
          +def compute_on_last(fit_iteration: int) -> bool +
          +
          +
          +
          + +Expand source code + +
          @staticmethod
          +def compute_on_last(fit_iteration: int) -> bool:
          +    return False
          +
          +
          def load(path)
          -
          +
          -Source code + +Expand source code +
          @classmethod
           def load(cls, path):
               with open(path, "rb") as f:
          @@ -234,14 +552,20 @@ 

          Static methods

          Methods

          -def call(self, model) +def call(self, model, precomputed_data: Dict[str, Any] = None)
          -

          Call to custom score function.

          +

          Call to custom score function.

          Parameters

          model : TopicModel
          a TopicNet model inherited from BaseModel
          +
          precomputed_data
          +
          Data which scores may share between each other during one fit iteration. +For example, if the model has several scores of the same score class, +and there is a heavy time consuming computation inside this score class, +it may be useful to perform the calculations only once, for one score instance, +and then make the result visible for all other scores that might need it.

          Returns

          @@ -252,10 +576,47 @@

          Notes

          Higher score not necessarily should correspond to better model. It is up to user to decide what the meaning is behind the score, and then use this logic in query in Experiment's select() method.

          -

          If one need ARTM model for score (not TopicNet one), it is available as model._model

          +

          If one need ARTM model for score (not TopicNet one), it is available as model._model

          +

          When creating a custom score class, +it is recommended to use **kwargs in the score's call method, +so that all BaseScore optional parameters are also available +in its successor score classes.

          +

          Examples

          +

          Score which uses precomputed_data:

          +
          >>> import time
          +...
          +>>> class NewScore(BaseScore):
          +...     def __init__(self, name: str, multiplier: float):
          +...         super().__init__(name=name)
          +...
          +...         self._multiplier = multiplier
          +...         self._heavy_value_name = 'time_consuming_value_name'
          +...
          +...     def call(self, model, precomputed_data = None):
          +...         if precomputed_data is None:
          +...             # Parameter <code>precomputed\_data</code> is optional in BaseScore
          +...             # So this case also should be supported
          +...             heavy_value = self._compute_heavy(model)
          +...         elif self._heavy_value_name in precomputed_data:
          +...             # This is going to be fast
          +...             heavy_value = precomputed_data[self._heavy_value_name]
          +...         else:
          +...             # This is slow (but only one such call!)
          +...             heavy_value = self._compute_heavy(model)
          +...             precomputed_data[self._heavy_value_name] = heavy_value
          +...
          +...         return heavy_value * self._multiplier
          +...
          +...     def _compute_heavy(self, model):
          +...         time.sleep(100)  # just for demonstration
          +...
          +...         return 0
          +
          -Source code -
          def call(self, model):
          +
          +Expand source code
          +
          +
          def call(self, model, precomputed_data: Dict[str, Any] = None):
               """
               Call to custom score function.
           
          @@ -263,6 +624,12 @@ 

          Notes

          ---------- model : TopicModel a TopicNet model inherited from BaseModel + precomputed_data + Data which scores may share between each other during *one fit iteration*. + For example, if the model has several scores of the same score class, + and there is a heavy time consuming computation inside this score class, + it may be useful to perform the calculations *only once*, for one score instance, + and then make the result visible for all other scores that might need it. Returns ------- @@ -276,6 +643,45 @@

          Notes

          and then use this logic in query in Experiment's `select()` method. If one need ARTM model for score (not TopicNet one), it is available as model._model + + When creating a custom score class, + it is recommended to use `**kwargs` in the score's `call` method, + so that all `BaseScore` optional parameters are also available + in its successor score classes. + + Examples + -------- + + Score which uses `precomputed_data`: + + >>> import time + ... + >>> class NewScore(BaseScore): + ... def __init__(self, name: str, multiplier: float): + ... super().__init__(name=name) + ... + ... self._multiplier = multiplier + ... self._heavy_value_name = 'time_consuming_value_name' + ... + ... def call(self, model, precomputed_data = None): + ... if precomputed_data is None: + ... # Parameter `precomputed_data` is optional in BaseScore + ... # So this case also should be supported + ... heavy_value = self._compute_heavy(model) + ... elif self._heavy_value_name in precomputed_data: + ... # This is going to be fast + ... heavy_value = precomputed_data[self._heavy_value_name] + ... else: + ... # This is slow (but only one such call!) + ... heavy_value = self._compute_heavy(model) + ... precomputed_data[self._heavy_value_name] = heavy_value + ... + ... return heavy_value * self._multiplier + ... + ... def _compute_heavy(self, model): + ... time.sleep(100) # just for demonstration + ... + ... return 0 """ raise NotImplementedError('Define your score here')
          @@ -284,9 +690,11 @@

          Notes

          def save(self, path)
          -
          +
          -Source code + +Expand source code +
          def save(self, path):
               with open(path, "wb") as f:
                   dill.dump(self, f)
          @@ -296,14 +704,16 @@

          Notes

          def update(self, score)
          -

          Parameters

          +

          Parameters

          score : float
          score value
          -

          Returns

          +

          Returns

          -Source code + +Expand source code +
          def update(self, score):
               """
           
          @@ -346,8 +756,10 @@ 

          Index

          • BaseScore

            -
              +
              • call
              • +
              • compute_always
              • +
              • compute_on_last
              • load
              • save
              • update
              • @@ -359,7 +771,7 @@

                -

                Generated by pdoc 0.6.3.

                +

                Generated by pdoc 0.8.1.

                diff --git a/docs/cooking_machine/models/blei_lafferty_score.html b/docs/cooking_machine/models/blei_lafferty_score.html index c2c9e5c..b2c1dde 100644 --- a/docs/cooking_machine/models/blei_lafferty_score.html +++ b/docs/cooking_machine/models/blei_lafferty_score.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.models.blei_lafferty_score API documentation - - + + @@ -21,8 +21,13 @@

                Module topicnet.cooking_machine.models.blei_lafferty_sco
                -Source code + +Expand source code +
                import numpy as np
                +
                +from typing import Callable
                +
                 from .base_score import BaseScore
                 
                 
                @@ -35,7 +40,11 @@ 

                Module topicnet.cooking_machine.models.blei_lafferty_sco to describe given topic. Summing up that score helps to estimate how well the model distinguishes between topics. The higher this score - better """ - def __init__(self, name: str = None, num_top_tokens: int = 30): + def __init__( + self, + name: str = None, + num_top_tokens: int = 30, + should_compute: Callable[[int], bool] = None): """ Parameters @@ -46,7 +55,7 @@

                Module topicnet.cooking_machine.models.blei_lafferty_sco now many tokens we consider to be """ - super().__init__(name=name) + super().__init__(name=name, should_compute=should_compute) self.num_top_tokens = num_top_tokens @@ -84,7 +93,7 @@

                Module topicnet.cooking_machine.models.blei_lafferty_sco scores = phi * multiplier return scores - def call(self, model): + def call(self, model, **kwargs): modalities = list(model.class_ids.keys()) score = 0 @@ -110,10 +119,10 @@

                Classes

                class BleiLaffertyScore -(name=None, num_top_tokens=30) +(name: str = None, num_top_tokens: int = 30, should_compute: Callable[[int], bool] = None)
                -

                This score implements method described in 2009 paper +

                This score implements method described in 2009 paper Blei, David M., and John D. Lafferty. "Topic models." Text Mining. Chapman and Hall/CRC, 2009. 101-124. At the core this score helps to discover tokens that are most likely @@ -125,9 +134,11 @@

                Parameters

                name of the score
                num_top_tokens : int
                now many tokens we consider to be
                -

                +

          -Source code + +Expand source code +
          class BleiLaffertyScore(BaseScore):
               """
               This score implements method described in 2009 paper
          @@ -137,7 +148,11 @@ 

          Parameters

          to describe given topic. Summing up that score helps to estimate how well the model distinguishes between topics. The higher this score - better """ - def __init__(self, name: str = None, num_top_tokens: int = 30): + def __init__( + self, + name: str = None, + num_top_tokens: int = 30, + should_compute: Callable[[int], bool] = None): """ Parameters @@ -148,7 +163,7 @@

          Parameters

          now many tokens we consider to be """ - super().__init__(name=name) + super().__init__(name=name, should_compute=should_compute) self.num_top_tokens = num_top_tokens @@ -186,7 +201,7 @@

          Parameters

          scores = phi * multiplier return scores - def call(self, model): + def call(self, model, **kwargs): modalities = list(model.class_ids.keys()) score = 0 @@ -239,7 +254,7 @@

          -

          Generated by pdoc 0.6.3.

          +

          Generated by pdoc 0.8.1.

          diff --git a/docs/cooking_machine/models/dummy_topic_model.html b/docs/cooking_machine/models/dummy_topic_model.html index 979efea..41aedb1 100644 --- a/docs/cooking_machine/models/dummy_topic_model.html +++ b/docs/cooking_machine/models/dummy_topic_model.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.models.dummy_topic_model API documentation - - + + @@ -21,7 +21,9 @@

          Module topicnet.cooking_machine.models.dummy_topic_model
          -Source code + +Expand source code +
          import artm
           import json
           import os
          @@ -394,6 +396,9 @@ 

          Module topicnet.cooking_machine.models.dummy_topic_model raise InvalidOperationError() def add_cube(self, cube): + raise InvalidOperationError() + + def describe_regularizers(self): raise InvalidOperationError()

          @@ -411,12 +416,14 @@

          Classes

          (scores, init_parameters=None, model_id=None, parent_model_id=None, description=None, experiment=None, save_path=None, *args, **kwargs)
          -

          Topic Model contains artm model and all necessary information: scores, training pipeline, etc.

          +

          Topic Model contains artm model and all necessary information: scores, training pipeline, etc.

          Notes

          Only TopicModel supposed to be able to create DummyTopicModel -("private" < access < "public")

          +("private" < access < "public")

          -Source code + +Expand source code +
          class DummyTopicModel(TopicModel):
               _dummy_attribute = '_is_dummy'
           
          @@ -702,9 +709,11 @@ 

          Instance variables

          var class_ids
          -
          +
          -Source code + +Expand source code +
          @property
           def class_ids(self):
               """"""
          @@ -718,19 +727,21 @@ 

          Methods

          def get_init_parameters(self, not_include=None)
          -
          +
          -Source code + +Expand source code +
          def get_init_parameters(self, not_include=None):
               """"""
               return self._init_parameters
          -def restore(self, dataset=None) +def restore(self, dataset: Dataset = None)
          -

          Restores dummy to original TopicModel

          +

          Restores dummy to original TopicModel

          Tries to load the data from drive (if model was saved). Otherwise tries to train the model using parent model, experiment and dataset.

          Parameters

          @@ -744,9 +755,11 @@

          Returns

          TopicModel
          Restored topic model
          -
          +
          -Source code + +Expand source code +
          def restore(self, dataset: Dataset = None):
               """Restores dummy to original TopicModel
           
          @@ -793,6 +806,7 @@ 

          Inherited members

        • get_phi
        • get_phi_dense
        • get_phi_sparse
        • +
        • get_regularizer
        • get_theta
        • load
        • make_dummy
        • @@ -811,9 +825,11 @@

          Inherited members

          (message="Dummy model can't do this")
          -

          Unspecified run-time error.

          +

          Unspecified run-time error.

          -Source code + +Expand source code +
          class InvalidOperationError(RuntimeError):
               def __init__(self, message='Dummy model can\'t do this'):
                   super().__init__(message)
          @@ -858,7 +874,7 @@

          -

          Generated by pdoc 0.6.3.

          +

          Generated by pdoc 0.8.1.

          diff --git a/docs/cooking_machine/models/example_score.html b/docs/cooking_machine/models/example_score.html index 6d9320c..22f2653 100644 --- a/docs/cooking_machine/models/example_score.html +++ b/docs/cooking_machine/models/example_score.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.models.example_score API documentation - - + + @@ -21,8 +21,13 @@

          Module topicnet.cooking_machine.models.example_score
          -Source code + +Expand source code +
          import numpy as np
          +
          +from typing import Callable
          +
           from .base_score import BaseScore
           
           
          @@ -34,7 +39,11 @@ 

          Module topicnet.cooking_machine.models.example_scoreModule topicnet.cooking_machine.models.example_scoreClasses

          class ScoreExample -(name=None, token_threshold=0.001) +(name: str = None, token_threshold: float = 0.001, should_compute: Callable[[int], bool] = None)
          -

          Example score that calculates +

          Example score that calculates average size of topic kernel across all topics. We inherit from BaseScore in order to have self.value property and self.update() method (the internal logic of TopicNet relies on them)

          @@ -93,9 +102,11 @@

          Parameters

          name of the score
          token_threshold : float
          what probabilities to take as token belonging to the topic
          -
          +

          -Source code + +Expand source code +
          class ScoreExample(BaseScore):
               """
               Example score that calculates
          @@ -104,7 +115,11 @@ 

          Parameters

          (the internal logic of TopicNet relies on them) """ - def __init__(self, name: str = None, token_threshold: float = 1e-3): + def __init__( + self, + name: str = None, + token_threshold: float = 1e-3, + should_compute: Callable[[int], bool] = None): """ Parameters @@ -115,11 +130,11 @@

          Parameters

          what probabilities to take as token belonging to the topic """ - super().__init__(name=name) + super().__init__(name=name, should_compute=should_compute) self.threshold = token_threshold - def call(self, model): + def call(self, model, **kwargs): """ Method that calculates the score @@ -145,10 +160,10 @@

          Ancestors

          Methods

          -def call(self, model) +def call(self, model, **kwargs)
          -

          Method that calculates the score

          +

          Method that calculates the score

          Parameters

          model : TopicModel
          @@ -158,10 +173,12 @@

          Returns

          score : float
          mean kernel size for all topics in the model
          -
          +
          -Source code -
          def call(self, model):
          +
          +Expand source code
          +
          +
          def call(self, model, **kwargs):
               """
               Method that calculates the score
           
          @@ -219,7 +236,7 @@ 

          diff --git a/docs/cooking_machine/models/frozen_score.html b/docs/cooking_machine/models/frozen_score.html index b37d832..2d424ba 100644 --- a/docs/cooking_machine/models/frozen_score.html +++ b/docs/cooking_machine/models/frozen_score.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.models.frozen_score API documentation - - + + @@ -21,7 +21,9 @@

          Module topicnet.cooking_machine.models.frozen_score
          -Source code + +Expand source code +
          import warnings
           
           from enum import Enum
          @@ -121,18 +123,63 @@ 

          Classes

          class FrozenScore -(value, original_score=None) +(value: List[Union[float, NoneType]], original_score: BaseScore = None)
          -

          Custom scores can have anything inside. +

          Custom scores can have anything inside. So there is a probability that pickle will not be able to dump them. Frozen score helps to store the value of the original score without its internal logic, so as it can be saved.

          Parameters

          -

          name: -Name of the score

          +
          +
          name
          +
          Name of the score
          +
          should_compute
          +
          +

          Function which decides whether the score should be computed +on the current fit iteration or not. +If should_compute is None, then score is going to be computed on every iteration. +At the same time, whatever function one defines, +score is always computed on the last fit iteration. +This is done for two reasons. +Firstly, so that the score is always computed at least once during model._fit(). +Secondly, so that experiment.select() works correctly.

          +

          The parameter should_compute might be helpful +if the score is slow but one still needs +to get the dependence of the score on iteration +(for the described case, one may compute the score +on every even iteration or somehow else). +However, be aware that if should_compute is used for some model's scores, +then the scores may have different number of values in model.scores! +Number of score values is the number of times the score was calculated; +first value corresponds to the first fit iteration +which passed should_compute etc.

          +

          There are a couple of things also worth noting. +Fit iteration numbering starts from zero. +And every new model._fit() call is a new range of fit iterations.

          +
          +
          +

          Examples

          +

          Scores created below are unworkable (as BaseScore has no call method implemented). +These are just the examples of how one can create a score and set some of its parameters.

          +

          Scores to be computed on every iteration:

          +
          >>> score = BaseScore()
          +>>> score = BaseScore(should_compute=BaseScore.compute_always)
          +>>> score = BaseScore(should_compute=lambda i: True)
          +>>> score = BaseScore(should_compute=True)
          +
          +

          Scores to be computed only on the last iteration:

          +
          >>> score = BaseScore(should_compute=BaseScore.compute_on_last)
          +>>> score = BaseScore(should_compute=lambda i: False)
          +>>> score = BaseScore(should_compute=False)
          +
          +

          Score to be computed only on even iterations:

          +
          >>> score = BaseScore(should_compute=lambda i: i % 2 == 0)
          +
          -Source code + +Expand source code +
          class FrozenScore(BaseScore):
               """
               Custom scores can have anything inside.
          @@ -215,15 +262,17 @@ 

          Ancestors

          Methods

          -def update(self, score_value) +def update(self, score_value: float) -> NoneType
          -

          Update is not supposed to be applied to Frozen score. +

          Update is not supposed to be applied to Frozen score. It is not supposed to be changed. Still, the situation with an endeavour to update can generally happen if one tries -to train the model further after loading.

          +to train the model further after loading.

          -Source code + +Expand source code +
          def update(self, score_value: float) -> None:
               """
               Update is not supposed to be applied to Frozen score.
          @@ -286,7 +335,7 @@ 

          diff --git a/docs/cooking_machine/models/index.html b/docs/cooking_machine/models/index.html index ba57f76..6d87e37 100644 --- a/docs/cooking_machine/models/index.html +++ b/docs/cooking_machine/models/index.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.models API documentation - - + + @@ -19,57 +19,15 @@

          Module topicnet.cooking_machine.models

          -

          Models and scores

          -

          Availiable models:

          -
            -
          • BaseModel — Parent class for model creation
          • -
          • TopicModel — a wrapper class for bigartm topic model
          • -
          • DummyTopicModel — a fake model that contains training information but not actual artm model. Needed to save memory space during the training.
          • -
          -
          -

          Availiable scores:

          -
            -
          • BaseScore — a parent class for all the Strategies
          • -
          • ExampleScore — Example of minimal working example of custom score
          • -
          • IntratextCoherenceScore — score that calculates coherence as a measure of interpretability of the model using raw documents from dataset. Calculation-heavy score. Recommended to be used after model training
          • -
          • BleiLaffertyScore — An experimental light-weight score to estimate interpretability of the topics
          • -
          • SemanticRadiusScore — An experimental score reflecting whether collection is adequately described by topics. Lower if better. Calculation-heavy score.
          • -
          -
          -

          Internal model structure

          -

          main model attributes:

          -
            -
          • model_id — a model string id, unique for its Experiment.

          • -
          • scores — dict of lists, each list corresponds to the score value or list of values at certain training stage.

          • -
          • custom_scores — variable providing custom scores for the model

          • -
          • custom_regularizers — variable providing custom regularizers for the model. An example is provided in topic_prior_regularizer.py.

          • -
          -

          main model methods:

          -
            -
          • _fit — function performing model training. Takes the dataset and number of iterations. Optionally, you can pass custom_regularizers here, if you wish to apply them to a single iteration.

            -

            Important Notice! We assume that the model training happens through Cube interface and this method, while important should never be used by users if they are hope to have their actions logged

          • -
          • get_phi — function that returns p(token|topic/cluster) probability distributions that returns pandas.DataFrame with tokens as index and topics/clusters as columns

            -

            Important Notice! Strictly speaking the function returns degree to which token belongs to the topic/cluster and shouldn’t be a probability distribution. But scince its main use-case intended for topic models some of the functions using this method might work incorrectly in non-distribution case

          • -
          • get_theta — function that returns p(topic/cluster|document) probability distributions that returns pandas.DataFrame with topics/clusters as index and document ids as columns.

            -

            Important Notice! Strictly speaking, the function returns the degree to which a document belongs to the topic/cluster, and it does not have to be a probability distribution. But since its main use case is topic models, some of the functions using this method might work incorrectly in the non-distribution case.

          • -
          • save — saves model to the path directory.

          • -
          • load — loads model from the path directory

          • -
          • clone — creates copy of a model.

          • -
          • get_jsonable_from_parameters — turns model parameters to jsonable format for logging purposes

          • -
          -
          -

          What do you need to create your own model?

          -

          Following these steps, you should be able to code a model integrated with the library methods:

          -
            -
          1. New model class is inherited from BaseModel

          2. -
          3. A child class should contain methods __init__, _fit, get_phi, get_theta, save, load, clone, get_jsonable_from_parameters.

          4. -
          -Source code + +Expand source code +
          from .base_model import BaseModel
           from .topic_model import TopicModel
           from .dummy_topic_model import DummyTopicModel
          +
           from .base_score import BaseScore
           from .example_score import ScoreExample
           from .intratext_coherence_score import IntratextCoherenceScore
          @@ -84,59 +42,59 @@ 

          Sub-modules

          topicnet.cooking_machine.models.base_model
          -
          +
          topicnet.cooking_machine.models.base_regularizer
          -
          +
          topicnet.cooking_machine.models.base_score
          -
          +
          topicnet.cooking_machine.models.blei_lafferty_score
          -
          +
          topicnet.cooking_machine.models.dummy_topic_model
          -
          +
          topicnet.cooking_machine.models.example_score
          -
          +
          topicnet.cooking_machine.models.frozen_score
          -
          +
          topicnet.cooking_machine.models.intratext_coherence_score
          -
          +
          topicnet.cooking_machine.models.scores
          -
          +
          topicnet.cooking_machine.models.scores_wrapper
          -
          +
          topicnet.cooking_machine.models.semantic_radius_score
          -
          +
          topicnet.cooking_machine.models.thetaless_regularizer
          -
          +
          topicnet.cooking_machine.models.topic_model
          -
          +
          topicnet.cooking_machine.models.topic_prior_regularizer
          -
          +
          @@ -148,7 +106,7 @@

          Sub-modules

          diff --git a/docs/cooking_machine/models/intratext_coherence_score.html b/docs/cooking_machine/models/intratext_coherence_score.html index f110817..2c0e902 100644 --- a/docs/cooking_machine/models/intratext_coherence_score.html +++ b/docs/cooking_machine/models/intratext_coherence_score.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.models.intratext_coherence_score API documentation - - + + @@ -21,7 +21,9 @@

          Module topicnet.cooking_machine.models.intratext_coheren
          -Source code + +Expand source code +
          import dill
           import numpy as np
           import pandas as pd
          @@ -32,6 +34,7 @@ 

          Module topicnet.cooking_machine.models.intratext_coheren from collections import defaultdict from enum import Enum, IntEnum, auto from typing import ( + Callable, Dict, List, Optional, @@ -129,6 +132,7 @@

          Module topicnet.cooking_machine.models.intratext_coheren self, dataset: Union[Dataset, str], name: str = None, + should_compute: Callable[[int], bool] = None, keep_dataset_in_memory: bool = None, keep_dataset: bool = True, documents: List[str] = None, @@ -213,7 +217,7 @@

          Module topicnet.cooking_machine.models.intratext_coheren >>> topic_model._fit(dataset.get_batch_vectorizer(), num_iterations=num_iterations) """ # TODO: word_topic_relatedness seems to be connected with TopTokensViewer stuff - super().__init__(name=name) + super().__init__(name=name, should_compute=should_compute) self._keep_dataset = keep_dataset @@ -387,7 +391,7 @@

          Module topicnet.cooking_machine.models.intratext_coheren return score - def call(self, model: BaseModel) -> float: + def call(self, model: BaseModel, **kwargs) -> float: if (self._current_iteration - self._start_fit_iteration) % self._fit_iteration_step != 0: self._current_iteration += 1 @@ -736,10 +740,10 @@

          Classes

          class ComputationMethod -(*args, **kwargs) +(value, names=None, *, module=None, qualname=None, type=None, start=1)
          -

          Ways to compute intra-text coherence +

          Ways to compute intra-text coherence (see more about coherence below in IntratextCoherenceScore)

          Attributes

          SEGMENT_LENGTH : @@ -751,9 +755,11 @@

          Attributes

          Sum of specificities for the topic over words in given window. The process is as follows: word of the topic is found in text, it is the center of the first window; -next word of the topic is found (outside of the previous window), window; etc

          +next word of the topic is found (outside of the previous window), window; etc

          -Source code + +Expand source code +
          class ComputationMethod(IntEnum):
               """
               Ways to compute intra-text coherence
          @@ -786,24 +792,24 @@ 

          Class variables

          var SEGMENT_LENGTH
          -
          +
          var SEGMENT_WEIGHT
          -
          +
          var SUM_OVER_WINDOW
          -
          +
          class IntratextCoherenceScore -(dataset, name=None, keep_dataset_in_memory=None, keep_dataset=True, documents=None, documents_fraction=1.0, text_type=, computation_method=, word_topic_relatedness=, specificity_estimation=, max_num_out_of_topic_words=10, window=20, start_fit_iteration=0, fit_iteration_step=1, seed=11221963, verbose=False) +(dataset: Union[topicnet.cooking_machine.dataset.Dataset, str], name: str = None, should_compute: Callable[[int], bool] = None, keep_dataset_in_memory: bool = None, keep_dataset: bool = True, documents: List[str] = None, documents_fraction: float = 1.0, text_type: TextType = TextType.VW_TEXT, computation_method: ComputationMethod = ComputationMethod.SEGMENT_WEIGHT, word_topic_relatedness: WordTopicRelatednessType = WordTopicRelatednessType.PWT, specificity_estimation: SpecificityEstimationMethod = SpecificityEstimationMethod.NONE, max_num_out_of_topic_words: int = 10, window: int = 20, start_fit_iteration: int = 0, fit_iteration_step: int = 1, seed: int = 11221963, verbose: bool = False)
          -

          Computes intratext coherence

          +

          Computes intratext coherence

          For each topic of topic model its distribution throughout document collection is observed. Hypothetically, the better the topic, the more often it is represented by long segments of words highly related to the topic. @@ -830,16 +836,16 @@

          Parameters

          documents_fraction
          The fraction of all the documents in the Dataset to be used for coherence computation if documents parameter is not specified
          -
          text_type : TextType
          +
          text_type : TextType
          What text to use when computing coherence: raw text or VW text Preferable to use VW (as it is usually preprocessed, stop-words removed etc.), and with words in natural order. Score needs "real" text to compute coherence
          -
          computation_method : ComputationMethod
          +
          computation_method : ComputationMethod
          The way to compute intra-text coherence
          -
          word_topic_relatedness : WordTopicRelatednessType
          +
          word_topic_relatedness : WordTopicRelatednessType
          How to estimate word relevance to topic: using p(w | t) or p(t | w)
          -
          specificity_estimation : SpecificityEstimationMethod
          +
          specificity_estimation : SpecificityEstimationMethod
          How to estimate specificity of word to topic
          max_num_out_of_topic_words : int
          In case computation_method = ComputationMethod.SEGMENT_LENGTH or @@ -866,7 +872,7 @@

          Notes

          at the end of the training process (and not in the dependence of score on iteration), one should adjust start_fit_iteration and fit_iteration_step correspondingly. For example:

          -
          >>> # dataset = Dataset(...)
          +
          >>> # dataset = Dataset(...)
           >>> # topic_model = TopicModel(...)
           >>> num_iterations = 100
           >>> topic_model.custom_scores['intratext_coherence'] = IntratextCoherenceScore(
          @@ -874,9 +880,11 @@ 

          Notes

          >>> start_fit_iteration=num_iterations - 1 # last iteration: starting from zero >>> ) >>> topic_model._fit(dataset.get_batch_vectorizer(), num_iterations=num_iterations) -
          +

          -Source code + +Expand source code +
          class IntratextCoherenceScore(BaseScore):
               """Computes intratext coherence
           
          @@ -891,6 +899,7 @@ 

          Notes

          self, dataset: Union[Dataset, str], name: str = None, + should_compute: Callable[[int], bool] = None, keep_dataset_in_memory: bool = None, keep_dataset: bool = True, documents: List[str] = None, @@ -975,7 +984,7 @@

          Notes

          >>> topic_model._fit(dataset.get_batch_vectorizer(), num_iterations=num_iterations) """ # TODO: word_topic_relatedness seems to be connected with TopTokensViewer stuff - super().__init__(name=name) + super().__init__(name=name, should_compute=should_compute) self._keep_dataset = keep_dataset @@ -1149,7 +1158,7 @@

          Notes

          return score - def call(self, model: BaseModel) -> float: + def call(self, model: BaseModel, **kwargs) -> float: if (self._current_iteration - self._start_fit_iteration) % self._fit_iteration_step != 0: self._current_iteration += 1 @@ -1493,21 +1502,23 @@

          Ancestors

          Static methods

          -def load(path) +def load(path: str)
          -

          Parameters

          +

          Parameters

          path
           

          Returns

          -
          IntratextCoherenceScore
          +
          IntratextCoherenceScore
           
          -
          +
          -Source code + +Expand source code +
          @classmethod
           def load(cls, path: str):
               """
          @@ -1541,11 +1552,13 @@ 

          Returns

          Instance variables

          -
          var dataset
          +
          var datasetDataset
          -
          +
          -Source code + +Expand source code +
          @property
           def dataset(self) -> Dataset:
               return self._dataset
          @@ -1555,12 +1568,14 @@

          Instance variables

          Methods

          -def compute(self, model, topics=None, documents=None) +def compute(self, model: BaseModel, topics: List[str] = None, documents: List[str] = None) -> Dict[str, Union[float, NoneType]]
          -
          +
          -Source code + +Expand source code +
          def compute(
                   self,
                   model: BaseModel,
          @@ -1624,12 +1639,14 @@ 

          Methods

          -def save(self, path) +def save(self, path: str) -> NoneType
          -
          +
          -Source code + +Expand source code +
          def save(self, path: str) -> None:
               dataset = self._dataset
               self._dataset = None
          @@ -1653,10 +1670,10 @@ 

          Inherited members

          class SpecificityEstimationMethod -(*args, **kwargs) +(value, names=None, *, module=None, qualname=None, type=None, start=1)
          -

          Way to estimate how particular word is specific for particular topic. +

          Way to estimate how particular word is specific for particular topic. Unlike probability, eg. p(w | t), specificity_estimation takes into account values for all topics, eg. p(w | t_1), p(w | t_2), …, p(w | t_n): the higher the value p(w | t) comparing other p(w | t_i), @@ -1669,9 +1686,11 @@

          Attributes

          extract maximum among probabilities for the word and other topics AVERAGE : From probability, corresponding to word and topic, -extract average among probabilities for the word and other topics

          +extract average among probabilities for the word and other topics

          -Source code + +Expand source code +
          class SpecificityEstimationMethod(IntEnum):
               """
               Way to estimate how particular word is specific for particular topic.
          @@ -1705,26 +1724,28 @@ 

          Class variables

          var AVERAGE
          -
          +
          var MAXIMUM
          -
          +
          var NONE
          -
          +
          class TextType -(*args, **kwargs) +(value, names=None, *, module=None, qualname=None, type=None, start=1)
          -

          An enumeration.

          +

          An enumeration.

          -Source code + +Expand source code +
          class TextType(Enum):
               VW_TEXT = VW_TEXT_COL
               RAW_TEXT = RAW_TEXT_COL
          @@ -1737,27 +1758,29 @@

          Class variables

          var RAW_TEXT
          -
          +
          var VW_TEXT
          -
          +
          class WordTopicRelatednessType -(*args, **kwargs) +(value, names=None, *, module=None, qualname=None, type=None, start=1)
          -

          Word-topic relatedness estimate

          +

          Word-topic relatedness estimate

          Attributes

          PWT : p(w | t) PTW : -p(t | w)

          +p(t | w)

          -Source code + +Expand source code +
          class WordTopicRelatednessType(IntEnum):
               """
               Word-topic relatedness estimate
          @@ -1782,11 +1805,11 @@ 

          Class variables

          var PTW
          -
          +
          var PWT
          -
          +
          @@ -1851,7 +1874,7 @@

          -

          Generated by pdoc 0.6.3.

          +

          Generated by pdoc 0.8.1.

          diff --git a/docs/cooking_machine/models/scores.html b/docs/cooking_machine/models/scores.html index 2f8238f..016e55a 100644 --- a/docs/cooking_machine/models/scores.html +++ b/docs/cooking_machine/models/scores.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.models.scores API documentation - - + + @@ -21,7 +21,9 @@

          Module topicnet.cooking_machine.models.scores

          -Source code + +Expand source code +
          from .example_score import ScoreExample
           from .intratext_coherence_score import IntratextCoherenceScore
           from .blei_lafferty_score import BleiLaffertyScore
          @@ -41,10 +43,10 @@ 

          Classes

          class BleiLaffertyScore -(name=None, num_top_tokens=30) +(name: str = None, num_top_tokens: int = 30, should_compute: Callable[[int], bool] = None)
          -

          This score implements method described in 2009 paper +

          This score implements method described in 2009 paper Blei, David M., and John D. Lafferty. "Topic models." Text Mining. Chapman and Hall/CRC, 2009. 101-124. At the core this score helps to discover tokens that are most likely @@ -56,9 +58,11 @@

          Parameters

          name of the score
          num_top_tokens : int
          now many tokens we consider to be
          -
          +

          -Source code + +Expand source code +
          class BleiLaffertyScore(BaseScore):
               """
               This score implements method described in 2009 paper
          @@ -68,7 +72,11 @@ 

          Parameters

          to describe given topic. Summing up that score helps to estimate how well the model distinguishes between topics. The higher this score - better """ - def __init__(self, name: str = None, num_top_tokens: int = 30): + def __init__( + self, + name: str = None, + num_top_tokens: int = 30, + should_compute: Callable[[int], bool] = None): """ Parameters @@ -79,7 +87,7 @@

          Parameters

          now many tokens we consider to be """ - super().__init__(name=name) + super().__init__(name=name, should_compute=should_compute) self.num_top_tokens = num_top_tokens @@ -117,7 +125,7 @@

          Parameters

          scores = phi * multiplier return scores - def call(self, model): + def call(self, model, **kwargs): modalities = list(model.class_ids.keys()) score = 0 @@ -147,10 +155,10 @@

          Inherited members

          class IntratextCoherenceScore -(dataset, name=None, keep_dataset_in_memory=None, keep_dataset=True, documents=None, documents_fraction=1.0, text_type=, computation_method=, word_topic_relatedness=, specificity_estimation=, max_num_out_of_topic_words=10, window=20, start_fit_iteration=0, fit_iteration_step=1, seed=11221963, verbose=False) +(dataset: Union[topicnet.cooking_machine.dataset.Dataset, str], name: str = None, should_compute: Callable[[int], bool] = None, keep_dataset_in_memory: bool = None, keep_dataset: bool = True, documents: List[str] = None, documents_fraction: float = 1.0, text_type: TextType = TextType.VW_TEXT, computation_method: ComputationMethod = ComputationMethod.SEGMENT_WEIGHT, word_topic_relatedness: WordTopicRelatednessType = WordTopicRelatednessType.PWT, specificity_estimation: SpecificityEstimationMethod = SpecificityEstimationMethod.NONE, max_num_out_of_topic_words: int = 10, window: int = 20, start_fit_iteration: int = 0, fit_iteration_step: int = 1, seed: int = 11221963, verbose: bool = False)
          -

          Computes intratext coherence

          +

          Computes intratext coherence

          For each topic of topic model its distribution throughout document collection is observed. Hypothetically, the better the topic, the more often it is represented by long segments of words highly related to the topic. @@ -213,7 +221,7 @@

          Notes

          at the end of the training process (and not in the dependence of score on iteration), one should adjust start_fit_iteration and fit_iteration_step correspondingly. For example:

          -
          >>> # dataset = Dataset(...)
          +
          >>> # dataset = Dataset(...)
           >>> # topic_model = TopicModel(...)
           >>> num_iterations = 100
           >>> topic_model.custom_scores['intratext_coherence'] = IntratextCoherenceScore(
          @@ -221,9 +229,11 @@ 

          Notes

          >>> start_fit_iteration=num_iterations - 1 # last iteration: starting from zero >>> ) >>> topic_model._fit(dataset.get_batch_vectorizer(), num_iterations=num_iterations) -
          +

          -Source code + +Expand source code +
          class IntratextCoherenceScore(BaseScore):
               """Computes intratext coherence
           
          @@ -238,6 +248,7 @@ 

          Notes

          self, dataset: Union[Dataset, str], name: str = None, + should_compute: Callable[[int], bool] = None, keep_dataset_in_memory: bool = None, keep_dataset: bool = True, documents: List[str] = None, @@ -322,7 +333,7 @@

          Notes

          >>> topic_model._fit(dataset.get_batch_vectorizer(), num_iterations=num_iterations) """ # TODO: word_topic_relatedness seems to be connected with TopTokensViewer stuff - super().__init__(name=name) + super().__init__(name=name, should_compute=should_compute) self._keep_dataset = keep_dataset @@ -496,7 +507,7 @@

          Notes

          return score - def call(self, model: BaseModel) -> float: + def call(self, model: BaseModel, **kwargs) -> float: if (self._current_iteration - self._start_fit_iteration) % self._fit_iteration_step != 0: self._current_iteration += 1 @@ -840,21 +851,23 @@

          Ancestors

          Static methods

          -def load(path) +def load(path: str)
          -

          Parameters

          +

          Parameters

          path
           

          Returns

          -
          IntratextCoherenceScore
          +
          IntratextCoherenceScore
           
          -
          +
          -Source code + +Expand source code +
          @classmethod
           def load(cls, path: str):
               """
          @@ -888,11 +901,13 @@ 

          Returns

          Instance variables

          -
          var dataset
          +
          var datasetDataset
          -
          +
          -Source code + +Expand source code +
          @property
           def dataset(self) -> Dataset:
               return self._dataset
          @@ -902,12 +917,14 @@

          Instance variables

          Methods

          -def compute(self, model, topics=None, documents=None) +def compute(self, model: BaseModel, topics: List[str] = None, documents: List[str] = None) -> Dict[str, Union[float, NoneType]]
          -
          +
          -Source code + +Expand source code +
          def compute(
                   self,
                   model: BaseModel,
          @@ -971,12 +988,14 @@ 

          Methods

          -def save(self, path) +def save(self, path: str) -> NoneType
          -
          +
          -Source code + +Expand source code +
          def save(self, path: str) -> None:
               dataset = self._dataset
               self._dataset = None
          @@ -1000,10 +1019,10 @@ 

          Inherited members

          class ScoreExample -(name=None, token_threshold=0.001) +(name: str = None, token_threshold: float = 0.001, should_compute: Callable[[int], bool] = None)
          -

          Example score that calculates +

          Example score that calculates average size of topic kernel across all topics. We inherit from BaseScore in order to have self.value property and self.update() method (the internal logic of TopicNet relies on them)

          @@ -1013,9 +1032,11 @@

          Parameters

          name of the score
          token_threshold : float
          what probabilities to take as token belonging to the topic
          -
          +
          -Source code + +Expand source code +
          class ScoreExample(BaseScore):
               """
               Example score that calculates
          @@ -1024,7 +1045,11 @@ 

          Parameters

          (the internal logic of TopicNet relies on them) """ - def __init__(self, name: str = None, token_threshold: float = 1e-3): + def __init__( + self, + name: str = None, + token_threshold: float = 1e-3, + should_compute: Callable[[int], bool] = None): """ Parameters @@ -1035,11 +1060,11 @@

          Parameters

          what probabilities to take as token belonging to the topic """ - super().__init__(name=name) + super().__init__(name=name, should_compute=should_compute) self.threshold = token_threshold - def call(self, model): + def call(self, model, **kwargs): """ Method that calculates the score @@ -1065,10 +1090,10 @@

          Ancestors

          Methods

          -def call(self, model) +def call(self, model, **kwargs)
          -

          Method that calculates the score

          +

          Method that calculates the score

          Parameters

          model : TopicModel
          @@ -1078,10 +1103,12 @@

          Returns

          score : float
          mean kernel size for all topics in the model
          -
          +
          -Source code -
          def call(self, model):
          +
          +Expand source code
          +
          +
          def call(self, model, **kwargs):
               """
               Method that calculates the score
           
          @@ -1113,10 +1140,10 @@ 

          Inherited members

          class SemanticRadiusScore -(batch_vectorizer, name=None) +(batch_vectorizer, name: str = None)
          -

          This score implements cluster semantic radius, described in paper +

          This score implements cluster semantic radius, described in paper 'Проверка гипотезы условной независимости для оценивания качества тематической кластеризации' by Rogozina A. At the core this score helps to discover topics uniformity. @@ -1127,9 +1154,11 @@

          Parameters

          Name of the score
          batch_vectorizer
           
          -
          +
          -Source code + +Expand source code +
          class SemanticRadiusScore(BaseScore):
               """
               This score implements cluster semantic radius, described in paper
          @@ -1209,7 +1238,7 @@ 

          Methods

          def call(self, model, max_sampled_document_len=None, sample_step=5, sample_size=3, alpha=0.1)
          -

          Parameters

          +

          Parameters

          model : TopicModel
           
          @@ -1225,9 +1254,11 @@

          Methods

          alpha : float
          (1 - alpha) quantile level, must be <= 1
          (Default value = 0.1)
          -
          +
          -Source code + +Expand source code +
          def call(self, model, max_sampled_document_len=None, sample_step=5, sample_size=3, alpha=0.1):
               """
           
          @@ -1321,7 +1352,7 @@ 

          diff --git a/docs/cooking_machine/models/scores_wrapper.html b/docs/cooking_machine/models/scores_wrapper.html index c3e1add..bb0418f 100644 --- a/docs/cooking_machine/models/scores_wrapper.html +++ b/docs/cooking_machine/models/scores_wrapper.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.models.scores_wrapper API documentation - - + + @@ -21,7 +21,9 @@

          Module topicnet.cooking_machine.models.scores_wrapper
          -Source code + +Expand source code +
          import artm
           import copy
           from collections.abc import Mapping
          @@ -82,9 +84,9 @@ 

          Module topicnet.cooking_machine.models.scores_wrapperClasses

          class ScoresWrapper -(topicnet_scores, artm_scores) +(topicnet_scores: Dict[str, topicnet.cooking_machine.models.base_score.BaseScore], artm_scores: artm.scores.Scores)
          -
          +
          -Source code + +Expand source code +
          class ScoresWrapper(Mapping):
               def __init__(self,
                            topicnet_scores: Dict[str, BaseScore],
          @@ -169,9 +173,9 @@ 

          Classes

          elif isinstance(score, BaseScore): if score._name is None: raise ValueError( - f'When using `model.scores.add(score)` method,' - f' one should specify score name parameter during score initialization.' - f' For example `model.scores.add(IntratextCoherenceScore(name="name", ...))' + 'When using `model.scores.add(score)` method,' + ' one should specify score name parameter during score initialization.' + ' For example `model.scores.add(IntratextCoherenceScore(name="name", ...))' ) self._topicnet_scores[score._name] = score @@ -204,12 +208,14 @@

          Ancestors

          Methods

          -def add(self, score) +def add(self, score: Union[topicnet.cooking_machine.models.base_score.BaseScore, artm.scores.BaseScore])
          -
          +
          -Source code + +Expand source code +
          def add(self, score: Union[BaseScore, artm.scores.BaseScore]):
               if isinstance(score, FrozenScore):
                   raise TypeError('FrozenScore is not supposed to be added to model')
          @@ -217,9 +223,9 @@ 

          Methods

          elif isinstance(score, BaseScore): if score._name is None: raise ValueError( - f'When using `model.scores.add(score)` method,' - f' one should specify score name parameter during score initialization.' - f' For example `model.scores.add(IntratextCoherenceScore(name="name", ...))' + 'When using `model.scores.add(score)` method,' + ' one should specify score name parameter during score initialization.' + ' For example `model.scores.add(IntratextCoherenceScore(name="name", ...))' ) self._topicnet_scores[score._name] = score @@ -266,7 +272,7 @@

          -

          Generated by pdoc 0.6.3.

          +

          Generated by pdoc 0.8.1.

          diff --git a/docs/cooking_machine/models/semantic_radius_score.html b/docs/cooking_machine/models/semantic_radius_score.html index 24c1fdf..e4a14ba 100644 --- a/docs/cooking_machine/models/semantic_radius_score.html +++ b/docs/cooking_machine/models/semantic_radius_score.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.models.semantic_radius_score API documentation - - + + @@ -21,7 +21,9 @@

          Module topicnet.cooking_machine.models.semantic_radius_s
          -Source code + +Expand source code +
          import artm
           
           import operator
          @@ -243,9 +245,11 @@ 

          Functions

          def calculate_n(model, batch_vectorizer)
          -

          Calculate all necessary statistics from batch. This may take some time.

          +

          Calculate all necessary statistics from batch. This may take some time.

          -Source code + +Expand source code +
          def calculate_n(model, batch_vectorizer):
               """
               Calculate all necessary statistics from batch. This may take some time.
          @@ -296,9 +300,11 @@ 

          Functions

          def cressie_reed_sampled(topic, ntdw_calc, ntd_calc, nwt, nt, gimel=-0.5)
          -

          Calculate Cressie-Reed divergence for sampled pseudo-document.

          +

          Calculate Cressie-Reed divergence for sampled pseudo-document.

          -Source code + +Expand source code +
          def cressie_reed_sampled(topic, ntdw_calc, ntd_calc, nwt, nt, gimel=-1/2):
               """
               Calculate Cressie-Reed divergence for sampled pseudo-document.
          @@ -326,9 +332,11 @@ 

          Functions

          def radii_for_ntd(ntd, regression_coeff)
          -
          +
          -Source code + +Expand source code +
          def radii_for_ntd(ntd, regression_coeff):
               return ntd.apply(lambda x: third_degree(x, *regression_coeff))
          @@ -337,9 +345,11 @@

          Functions

          def radii_vs_ntd(max_len, sample_step, sample_size, nwt, nt, alpha)
          -
          +
          -Source code + +Expand source code +
          def radii_vs_ntd(max_len, sample_step, sample_size, nwt, nt, alpha):
               regression_coeffs = []
               for topic in range(len(nt)):
          @@ -354,9 +364,11 @@ 

          Functions

          def radius_for_ntd(ntd, regression_coeff)
          -
          +
          -Source code + +Expand source code +
          def radius_for_ntd(ntd, regression_coeff):
               return third_degree(ntd, *regression_coeff)
          @@ -365,9 +377,11 @@

          Functions

          def radius_vs_ndt(topic, max_len, sample_step, sample_size, nwt, nt, alpha)
          -

          Calculate third degree approximation for radius vs ndt dependency.

          +

          Calculate third degree approximation for radius vs ndt dependency.

          -Source code + +Expand source code +
          def radius_vs_ndt(topic, max_len, sample_step, sample_size, nwt, nt, alpha):
               """
               Calculate third degree approximation for radius vs ndt dependency.
          @@ -393,9 +407,11 @@ 

          Functions

          def synthetic_doc_ntdw_and_ntd(doc_len, nwt)
          -

          Create synthetic document from nwt with specific doc_len.

          +

          Create synthetic document from nwt with specific doc_len.

          -Source code + +Expand source code +
          def synthetic_doc_ntdw_and_ntd(doc_len, nwt):
               """
               Create synthetic document from nwt with specific doc_len.
          @@ -416,9 +432,11 @@ 

          Functions

          def third_degree(x, a, b, c, d)
          -
          +
          -Source code + +Expand source code +
          def third_degree(x, a, b, c, d):
               return a + b * x + c * x ** 2 + d * x ** 3
          @@ -430,10 +448,10 @@

          Classes

          class SemanticRadiusScore -(batch_vectorizer, name=None) +(batch_vectorizer, name: str = None)
          -

          This score implements cluster semantic radius, described in paper +

          This score implements cluster semantic radius, described in paper 'Проверка гипотезы условной независимости для оценивания качества тематической кластеризации' by Rogozina A. At the core this score helps to discover topics uniformity. @@ -444,9 +462,11 @@

          Parameters

          Name of the score
          batch_vectorizer
           
          -
          +

          -Source code + +Expand source code +
          class SemanticRadiusScore(BaseScore):
               """
               This score implements cluster semantic radius, described in paper
          @@ -526,7 +546,7 @@ 

          Methods

          def call(self, model, max_sampled_document_len=None, sample_step=5, sample_size=3, alpha=0.1)
          -

          Parameters

          +

          Parameters

          model : TopicModel
           
          @@ -542,9 +562,11 @@

          Methods

          alpha : float
          (1 - alpha) quantile level, must be <= 1
          (Default value = 0.1)
          -
          +
          -Source code + +Expand source code +
          def call(self, model, max_sampled_document_len=None, sample_step=5, sample_size=3, alpha=0.1):
               """
           
          @@ -632,7 +654,7 @@ 

          -

          Generated by pdoc 0.6.3.

          +

          Generated by pdoc 0.8.1.

          diff --git a/docs/cooking_machine/models/thetaless_regularizer.html b/docs/cooking_machine/models/thetaless_regularizer.html index cef6a37..9288ffd 100644 --- a/docs/cooking_machine/models/thetaless_regularizer.html +++ b/docs/cooking_machine/models/thetaless_regularizer.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.models.thetaless_regularizer API documentation - - + + @@ -21,17 +21,167 @@

          Module topicnet.cooking_machine.models.thetaless_regular
          -Source code + +Expand source code +
          import numpy as np
          -from numba import jit
          +import os
          +import pandas as pd
           import scipy.sparse
          +import warnings
          +
          +from numba import jit
          +
          +import artm
           
           from .base_regularizer import BaseRegularizer
          +from ..dataset import Dataset
          +
          +
          +# TODO: move this to BigARTM
          +# ==================================
          +
          +FIELDS = 'token class_id token_value token_tf token_df'.split()
          +
          +
          +def artm_dict2df(artm_dict):
          +    """
          +    :Description: converts the BigARTM dictionary of the collection
          +        to the pandas.DataFrame.
          +        This is approximately equivalent to the dictionary.save_text()
          +        but has no I/O overhead
          +
          +    """
          +    dictionary_data = artm_dict._master.get_dictionary(artm_dict._name)
          +    dict_pandas = {field: getattr(dictionary_data, field)
          +                   for field in FIELDS}
          +    return pd.DataFrame(dict_pandas)
          +
          +# ==================================
           
           
           EPS = 1e-20
           
           
          +# TODO: is there a better way to do this?
          +def obtain_token2id(dataset: Dataset):
          +    """
          +    Allows one to obtain the mapping from token to the artm.dictionary id of that token
          +    (useful for low-level operations such as reading batches manually)
          +
          +    Returns
          +    -------
          +    dict:
          +        maps (token, class_id) to integer (corresponding to the row of Phi / dictionary id)
          +
          +    """
          +    df = artm_dict2df(dataset.get_dictionary())
          +    df_inverted_index = df[['token', 'class_id']].reset_index().set_index(['token', 'class_id'])
          +
          +    return df_inverted_index.to_dict()['index']
          +
          +
          +def dataset2sparse_matrix(dataset, modality, modalities_to_use=None):
          +    """
          +    Builds a sparse matrix from batch_vectorizer linked to the Dataset
          +
          +    If you need an inverse mapping:
          +
          +    >>> d = sparse_n_dw_matrix.todok()  # convert to dictionary of keys format
          +    >>> dict_of_csr = dict(d.items())
          +
          +    Parameters
          +    ----------
          +    dataset: Dataset
          +    modality: str
          +        the remaining modalities will be ignored
          +        (their occurrences will be replaced with zeros, but they will continue to exist)
          +    modalities_to_use: iterable
          +        a set of modalities the underlying topic model is using (this is about topic model,
          +        not regularizer; this parameter ensures that the shapes of n_dw matrix and actual
          +        Phi matrix match).
          +
          +        The tokens outside of this list will be discarded utterly
          +        (the resulting matrix will have no entries corresponding to them)
          +
          +        For artm.ARTM() models, you need to pass whatever is inside class_ids;
          +        while TopicModel usually requires this to be set inside modalities_to_use.
          +
          +        If you hadn't explicitly listed any modalities yet, you probably could
          +        leave this argument as None.
          +
          +        If you use a single modality, wrap it into a list (e.g.['@word'])
          +
          +    Returns
          +    -------
          +    n_dw_matrix: scipy.sparse.csr_matrix  
          +        The matrix of document-word occurrences.  
          +        `n_dw` is a number of the occurrences of the word `w` in the document `d`  
          +        this matrix determines the dependence between the Theta and Phi matrices  
          +        (Phi is the result of one iteration of the ARTM's EM algorihtm  
          +        with uniform theta initialization and `n_dw` matrix of the document-word occurrences)  
          +    """  # noqa: W291
          +    token2id = obtain_token2id(dataset)
          +
          +    batch_vectorizer = dataset.get_batch_vectorizer()
          +
          +    return _batch_vectorizer2sparse_matrix(
          +        batch_vectorizer, token2id, modality, modalities_to_use
          +    )
          +
          +
          +def _batch_vectorizer2sparse_matrix(batch_vectorizer, token2id, modality, modalities_to_use=None):
          +    """
          +    """
          +    theta_column_naming = 'id'  # scipy sparse matrix doesn't support non-integer indices
          +    matrix_row, matrix_col, matrix_data = [], [], []
          +
          +    for batch_id in range(len(batch_vectorizer._batches_list)):
          +        batch_name = batch_vectorizer._batches_list[batch_id]._filename
          +        batch = artm.messages.Batch()
          +        with open(batch_name, "rb") as f:
          +            batch.ParseFromString(f.read())
          +
          +        for item_id in range(len(batch.item)):
          +            item = batch.item[item_id]
          +            theta_item_id = getattr(item, theta_column_naming)
          +
          +            for local_token_id, token_weight in zip(item.token_id, item.token_weight):
          +                token_class_id = batch.class_id[local_token_id]
          +                token = batch.token[local_token_id]
          +                if (token, token_class_id) not in token2id:
          +                    # probably dictionary was filtered
          +                    continue
          +                if modalities_to_use and token_class_id not in modalities_to_use:
          +                    continue
          +                if token_class_id != modality:
          +                    # we still need these tokens,
          +                    # shapes of n_dw matrix and actual Phi matrix should be in sync.
          +                    # this will be changed to zero at the end
          +                    token_weight = np.nan
          +                token_id = token2id[(token, token_class_id)]
          +                matrix_row.append(theta_item_id)
          +                matrix_col.append(token_id)
          +                matrix_data.append(token_weight)
          +
          +    sparse_n_dw_matrix = scipy.sparse.csr_matrix(
          +        (matrix_data, (matrix_row, matrix_col)),
          +    )
          +    # remove the columns whose all elements are zero
          +    # (i.e. tokens which are of different modalities)
          +    # and renumber index (fill any "holes")
          +    # this is needed to be in sync with artm dictionary after filtering elements out
          +    # (they need to have the same shape)
          +    ind = sparse_n_dw_matrix.sum(axis=0)
          +    nonzeros = np.ravel(ind > 0)
          +    sparse_n_dw_matrix = sparse_n_dw_matrix[:, nonzeros]
          +
          +    # re-encode values to transform NaNs to explicitly stored zeros
          +    sparse_n_dw_matrix.data = np.nan_to_num(sparse_n_dw_matrix.data)
          +
          +    return sparse_n_dw_matrix
          +
          +
           @jit(nopython=True)
           def memory_efficient_inner1d(fst_arr, fst_indices, snd_arr, snd_indices):
               """
          @@ -116,6 +266,7 @@ 

          Module topicnet.cooking_machine.models.thetaless_regular # set rows where sum of row is small to uniform res[np.sum(res, axis=1) < EPS, :] = 1. res /= np.sum(res, axis=1)[:, np.newaxis] + return res @@ -137,40 +288,63 @@

          Module topicnet.cooking_machine.models.thetaless_regular class ThetalessRegularizer(BaseRegularizer): - def __init__(self, name, tau, n_dw_matrix): + def __init__(self, name, tau, modality, dataset: Dataset): """ - Creates a node in the graph with the given args and kwargs. + A regularizer based on a "thetaless" topic model inference + + Note: this implementation stores sparse `n_dw` matrix in memory, + so this is not particularly memory- and space-efficient for huge datasets Parameters ---------- name: str name of the regularizer tau: Number - fictive parameter it's not used, just passed to the parent conctructor - n_dw_matrix: scipy.sparse.csr_matrix - The matrix of document-word occurrences - n_dw is a number of the occurrences of the word w in the document d - this matrix determines the dependence between the Theta and Phi matrices - (Phi is the result of one iteration of the ARTM's EM algorihtm - with uniform theta initialization and n_dw matrix of the document-word occurrences) - """ + according to the math, `tau` should be set to 1 (to correctly emulate a different + inference process). But you do you, it's not like there's a regularizer + police or something. + modality: str + name of modality on which the inference should be based + dataset + will be transformed to n_dw_matrix + """ # noqa: W291 super().__init__(name, tau) - self.n_dw_matrix = n_dw_matrix + + self.modality = modality + self.modalities_to_use = None + self.n_dw_matrix = None + + self.token2id = obtain_token2id(dataset) + self._batches_path = os.path.join(dataset._internals_folder_path, "batches") + + def _initialize_matrices(self, batch_vectorizer, token2id): + self.n_dw_matrix = _batch_vectorizer2sparse_matrix( + batch_vectorizer, token2id, self.modality, self.modalities_to_use + ) self.B = scipy.sparse.csr_matrix( ( - 1. * n_dw_matrix.data / calc_docsizes(n_dw_matrix), - n_dw_matrix.indices, - n_dw_matrix.indptr + 1. 
* self.n_dw_matrix.data / calc_docsizes(self.n_dw_matrix), + self.n_dw_matrix.indices, + self.n_dw_matrix.indptr ), - shape=n_dw_matrix.shape + shape=self.n_dw_matrix.shape ).tocsc() - self.docptr = get_docptr(n_dw_matrix) - self.wordptr = n_dw_matrix.indices + self.docptr = get_docptr(self.n_dw_matrix) + self.wordptr = self.n_dw_matrix.indices def grad(self, pwt, nwt): phi_matrix_tr = np.array(pwt) phi_matrix = phi_matrix_tr.T phi_rev_matrix = get_prob_matrix_by_counters(phi_matrix_tr) + + if self.n_dw_matrix.shape[1] != phi_rev_matrix.shape[0]: + raise ValueError( + f"Thetaless regularizer has prepared {self.n_dw_matrix.shape} n_dw matrix," + f" but was passed {phi_rev_matrix.T.shape} Phi matrix containing different" + f" number of tokens ({self.n_dw_matrix.shape[1]} != {phi_rev_matrix.shape[0]})" + f"\n(Are modalities the same?)" + ) + theta_matrix = get_prob_matrix_by_counters( self.n_dw_matrix.dot(phi_rev_matrix) ) @@ -188,7 +362,27 @@

          Module topicnet.cooking_machine.models.thetaless_regular tmp = g_dt.T * self.B / (phi_matrix_tr.sum(axis=1) + EPS) n_tw += (tmp - np.einsum('ij,ji->i', phi_rev_matrix, tmp)) * phi_matrix - return n_tw.T - nwt

          + return self.tau * (n_tw.T - nwt) + + def attach(self, model): + """ + + Parameters + ---------- + model : ARTM model + necessary to apply master component + """ + if model.num_document_passes != 1: + warnings.warn( + f"num_document_passes is equal to {model.num_document_passes}, but it" + f" should be set to {1} to correctly emulate a thetaless inference process" + ) + + self.modalities_to_use = model.class_ids.keys() + bv = artm.BatchVectorizer(data_path=self._batches_path, data_format='batches') + self._initialize_matrices(bv, self.token2id) + + self._model = model

          @@ -198,13 +392,41 @@

          Module topicnet.cooking_machine.models.thetaless_regular

          Functions

          +
          +def artm_dict2df(artm_dict) +
          +
          +

          :Description: converts the BigARTM dictionary of the collection +to the pandas.DataFrame. +This is approximately equivalent to the dictionary.save_text() +but has no I/O overhead

          +
          + +Expand source code + +
          def artm_dict2df(artm_dict):
          +    """
          +    :Description: converts the BigARTM dictionary of the collection
          +        to the pandas.DataFrame.
          +        This is approximately equivalent to the dictionary.save_text()
          +        but has no I/O overhead
          +
          +    """
          +    dictionary_data = artm_dict._master.get_dictionary(artm_dict._name)
          +    dict_pandas = {field: getattr(dictionary_data, field)
          +                   for field in FIELDS}
          +    return pd.DataFrame(dict_pandas)
          +
          +
          def calc_A_matrix(n_dw_matrix, theta_matrix, docptr, phi_matrix_tr, wordptr)
          -
          +
          -Source code + +Expand source code +
          def calc_A_matrix(
               n_dw_matrix, theta_matrix, docptr, phi_matrix_tr, wordptr
           ):
          @@ -226,9 +448,11 @@ 

          Functions

          def calc_docsizes(n_dw_matrix)
          -
          +
          -Source code + +Expand source code +
          def calc_docsizes(n_dw_matrix):
               D, _ = n_dw_matrix.shape
               docsizes = []
          @@ -242,22 +466,117 @@ 

          Functions

          return np.array(docsizes)
          +
          +def dataset2sparse_matrix(dataset, modality, modalities_to_use=None) +
          +
          +

          Builds a sparse matrix from batch_vectorizer linked to the Dataset

          +

          If you need an inverse mapping:

          +
          >>> d = sparse_n_dw_matrix.todok()  # convert to dictionary of keys format
          +>>> dict_of_csr = dict(d.items())
          +
          +

          Parameters

          +
          +
          dataset : Dataset
          +
           
          +
          modality : str
          +
          the remaining modalities will be ignored +(their occurrences will be replaced with zeros, but they will continue to exist)
          +
          modalities_to_use : iterable
          +
          +

          a set of modalities the underlying topic model is using (this is about topic model, +not regularizer; this parameter ensures that the shapes of n_dw matrix and actual +Phi matrix match).

          +

          The tokens outside of this list will be discarded utterly +(the resulting matrix will have no entries corresponding to them)

          +

          For artm.ARTM() models, you need to pass whatever is inside class_ids; +while TopicModel usually requires this to be set inside modalities_to_use.

          +

          If you hadn't explicitly listed any modalities yet, you probably could +leave this argument as None.

          +

          If you use a single modality, wrap it into a list (e.g.['@word'])

          +
          +
          +

          Returns

          +
          +
          n_dw_matrix : scipy.sparse.csr_matrix +
          +
          The matrix of document-word occurrences.
          +n_dw is a number of the occurrences of the word w in the document d
          +this matrix determines the dependence between the Theta and Phi matrices
          +(Phi is the result of one iteration of the ARTM's EM algorihtm
          +with uniform theta initialization and n_dw matrix of the document-word occurrences)
          +
          +
          + +Expand source code + +
          def dataset2sparse_matrix(dataset, modality, modalities_to_use=None):
          +    """
          +    Builds a sparse matrix from batch_vectorizer linked to the Dataset
          +
          +    If you need an inverse mapping:
          +
          +    >>> d = sparse_n_dw_matrix.todok()  # convert to dictionary of keys format
          +    >>> dict_of_csr = dict(d.items())
          +
          +    Parameters
          +    ----------
          +    dataset: Dataset
          +    modality: str
          +        the remaining modalities will be ignored
          +        (their occurrences will be replaced with zeros, but they will continue to exist)
          +    modalities_to_use: iterable
          +        a set of modalities the underlying topic model is using (this is about topic model,
          +        not regularizer; this parameter ensures that the shapes of n_dw matrix and actual
          +        Phi matrix match).
          +
          +        The tokens outside of this list will be discarded utterly
          +        (the resulting matrix will have no entries corresponding to them)
          +
          +        For artm.ARTM() models, you need to pass whatever is inside class_ids;
          +        while TopicModel usually requires this to be set inside modalities_to_use.
          +
          +        If you hadn't explicitly listed any modalities yet, you probably could
          +        leave this argument as None.
          +
          +        If you use a single modality, wrap it into a list (e.g.['@word'])
          +
          +    Returns
          +    -------
          +    n_dw_matrix: scipy.sparse.csr_matrix  
          +        The matrix of document-word occurrences.  
          +        `n_dw` is a number of the occurrences of the word `w` in the document `d`  
          +        this matrix determines the dependence between the Theta and Phi matrices  
          +        (Phi is the result of one iteration of the ARTM's EM algorihtm  
          +        with uniform theta initialization and `n_dw` matrix of the document-word occurrences)  
          +    """  # noqa: W291
          +    token2id = obtain_token2id(dataset)
          +
          +    batch_vectorizer = dataset.get_batch_vectorizer()
          +
          +    return _batch_vectorizer2sparse_matrix(
          +        batch_vectorizer, token2id, modality, modalities_to_use
          +    )
          +
          +
          def get_docptr(n_dw_matrix)
          -

          Parameters

          +

          Parameters

          -
          n_dw_matrix : array-like
          +
          n_dw_matrix : array-like
           

          Returns

          np.array
          row indices for the provided matrix
          -
          +
          -Source code + +Expand source code +
          def get_docptr(n_dw_matrix):
               """
               Parameters
          @@ -276,9 +595,11 @@ 

          Returns

          def get_prob_matrix_by_counters(counters, inplace=False)
          -
          +
          -Source code + +Expand source code +
          def get_prob_matrix_by_counters(counters, inplace=False):
               if inplace:
                   res = counters
          @@ -288,6 +609,7 @@ 

          Returns

          # set rows where sum of row is small to uniform res[np.sum(res, axis=1) < EPS, :] = 1. res /= np.sum(res, axis=1)[:, np.newaxis] + return res
          @@ -295,15 +617,15 @@

          Returns

          def memory_efficient_inner1d(fst_arr, fst_indices, snd_arr, snd_indices)
          -

          Parameters

          +

          Parameters

          -
          fst_arr : array-like
          +
          fst_arr : array-like
          2d array, shape is N x T
          -
          fst_indices : array-like
          +
          fst_indices : array-like
          indices of the rows in fst_arr
          -
          snd_arr : array-like
          +
          snd_arr : array-like
          2d array, shape is M x T
          -
          snd_indices : array-like
          +
          snd_indices : array-like
          indices of the rows in fst_arr

          Returns

          @@ -314,9 +636,11 @@

          Returns

          sum(fst_arr[i, k] * snd_arr[j, k] for k in 0..T) for i, j in fst_indices, snd_indices ])
          -
          +
          -Source code + +Expand source code +
          @jit(nopython=True)
           def memory_efficient_inner1d(fst_arr, fst_indices, snd_arr, snd_indices):
               """
          @@ -355,6 +679,38 @@ 

          Returns

          return result

          +
          +def obtain_token2id(dataset: Dataset) +
          +
          +

          Allows one to obtain the mapping from token to the artm.dictionary id of that token +(useful for low-level operations such as reading batches manually)

          +

          Returns

          +
          +
          dict:
          +
          maps (token, class_id) to integer (corresponding to the row of Phi / dictionary id)
          +
          +
          + +Expand source code + +
          def obtain_token2id(dataset: Dataset):
          +    """
          +    Allows one to obtain the mapping from token to the artm.dictionary id of that token
          +    (useful for low-level operations such as reading batches manually)
          +
          +    Returns
          +    -------
          +    dict:
          +        maps (token, class_id) to integer (corresponding to the row of Phi / dictionary id)
          +
          +    """
          +    df = artm_dict2df(dataset.get_dictionary())
          +    df_inverted_index = df[['token', 'class_id']].reset_index().set_index(['token', 'class_id'])
          +
          +    return df_inverted_index.to_dict()['index']
          +
          +
          @@ -362,61 +718,88 @@

          Classes

          class ThetalessRegularizer -(name, tau, n_dw_matrix) +(name, tau, modality, dataset: Dataset)
          -

          Base regularizer class to construct custom regularizers.

          -

          Creates a node in the graph with the given args and kwargs.

          +

          Base regularizer class to construct custom regularizers.

          +

          A regularizer based on a "thetaless" topic model inference

          +

          Note: this implementation stores sparse n_dw matrix in memory, +so this is not particularly memory- and space-efficient for huge datasets

          Parameters

          name : str
          name of the regularizer
          tau : Number
          -
          fictive parameter it's not used, just passed to the parent conctructor
          -
          n_dw_matrix : scipy.sparse.csr_matrix
          -
          The matrix of document-word occurrences -n_dw is a number of the occurrences of the word w in the document d -this matrix determines the dependence between the Theta and Phi matrices -(Phi is the result of one iteration of the ARTM's EM algorihtm -with uniform theta initialization and n_dw matrix of the document-word occurrences)
          -
          +
          according to the math, tau should be set to 1 (to correctly emulate a different
          +inference process). But you do you, it's not like there's a regularizer
          +police or something.
          +
          modality : str
          +
          name of modality on which the inference should be based
          +
          dataset
          +
          will be transformed to n_dw_matrix
          +
          -Source code + +Expand source code +
          class ThetalessRegularizer(BaseRegularizer):
          -    def __init__(self, name, tau, n_dw_matrix):
          +    def __init__(self, name, tau, modality, dataset: Dataset):
                   """
          -        Creates a node in the graph with the given args and kwargs.
          +        A regularizer based on a "thetaless" topic model inference
          +
          +        Note: this implementation stores sparse `n_dw` matrix in memory,
          +        so this is not particularly memory- and space-efficient for huge datasets
           
                   Parameters
                   ----------
                   name: str
                       name of the regularizer
                   tau: Number
          -            fictive parameter it's not used, just passed to the parent conctructor
          -        n_dw_matrix: scipy.sparse.csr_matrix
          -            The matrix of document-word occurrences
          -            n_dw is a number of the occurrences of the word w in the document d
          -            this matrix determines the dependence between the Theta and Phi matrices
          -            (Phi is the result of one iteration of the ARTM's EM algorihtm
          -            with uniform theta initialization and n_dw matrix of the document-word occurrences)
          -        """
          +            according to the math, `tau` should be set to 1 (to correctly emulate a different  
          +            inference process). But you do you, it's not like there's a regularizer  
          +            police or something.  
          +        modality: str
          +            name of modality on which the inference should be based
          +        dataset
          +            will be transformed to n_dw_matrix
          +        """  # noqa: W291
                   super().__init__(name, tau)
          -        self.n_dw_matrix = n_dw_matrix
          +
          +        self.modality = modality
          +        self.modalities_to_use = None
          +        self.n_dw_matrix = None
          +
          +        self.token2id = obtain_token2id(dataset)
          +        self._batches_path = os.path.join(dataset._internals_folder_path, "batches")
          +
          +    def _initialize_matrices(self, batch_vectorizer, token2id):
          +        self.n_dw_matrix = _batch_vectorizer2sparse_matrix(
          +            batch_vectorizer, token2id, self.modality, self.modalities_to_use
          +        )
                   self.B = scipy.sparse.csr_matrix(
                       (
          -                1. * n_dw_matrix.data / calc_docsizes(n_dw_matrix),
          -                n_dw_matrix.indices,
          -                n_dw_matrix.indptr
          +                1. * self.n_dw_matrix.data / calc_docsizes(self.n_dw_matrix),
          +                self.n_dw_matrix.indices,
          +                self.n_dw_matrix.indptr
                       ),
          -            shape=n_dw_matrix.shape
          +            shape=self.n_dw_matrix.shape
                   ).tocsc()
          -        self.docptr = get_docptr(n_dw_matrix)
          -        self.wordptr = n_dw_matrix.indices
          +        self.docptr = get_docptr(self.n_dw_matrix)
          +        self.wordptr = self.n_dw_matrix.indices
           
               def grad(self, pwt, nwt):
                   phi_matrix_tr = np.array(pwt)
                   phi_matrix = phi_matrix_tr.T
                   phi_rev_matrix = get_prob_matrix_by_counters(phi_matrix_tr)
          +
          +        if self.n_dw_matrix.shape[1] != phi_rev_matrix.shape[0]:
          +            raise ValueError(
          +                f"Thetaless regularizer has prepared {self.n_dw_matrix.shape} n_dw matrix,"
          +                f" but was passed {phi_rev_matrix.T.shape} Phi matrix containing different"
          +                f" number of tokens ({self.n_dw_matrix.shape[1]} != {phi_rev_matrix.shape[0]})"
          +                f"\n(Are modalities the same?)"
          +            )
          +
                   theta_matrix = get_prob_matrix_by_counters(
                       self.n_dw_matrix.dot(phi_rev_matrix)
                   )
          @@ -434,7 +817,27 @@ 

          Parameters

          tmp = g_dt.T * self.B / (phi_matrix_tr.sum(axis=1) + EPS) n_tw += (tmp - np.einsum('ij,ji->i', phi_rev_matrix, tmp)) * phi_matrix - return n_tw.T - nwt
          + return self.tau * (n_tw.T - nwt) + + def attach(self, model): + """ + + Parameters + ---------- + model : ARTM model + necessary to apply master component + """ + if model.num_document_passes != 1: + warnings.warn( + f"num_document_passes is equal to {model.num_document_passes}, but it" + f" should be set to {1} to correctly emulate a thetaless inference process" + ) + + self.modalities_to_use = model.class_ids.keys() + bv = artm.BatchVectorizer(data_path=self._batches_path, data_format='batches') + self._initialize_matrices(bv, self.token2id) + + self._model = model

          Ancestors

            @@ -446,13 +849,24 @@

            Methods

            def grad(self, pwt, nwt)
            -
            +
            -Source code + +Expand source code +
            def grad(self, pwt, nwt):
                 phi_matrix_tr = np.array(pwt)
                 phi_matrix = phi_matrix_tr.T
                 phi_rev_matrix = get_prob_matrix_by_counters(phi_matrix_tr)
            +
            +    if self.n_dw_matrix.shape[1] != phi_rev_matrix.shape[0]:
            +        raise ValueError(
            +            f"Thetaless regularizer has prepared {self.n_dw_matrix.shape} n_dw matrix,"
            +            f" but was passed {phi_rev_matrix.T.shape} Phi matrix containing different"
            +            f" number of tokens ({self.n_dw_matrix.shape[1]} != {phi_rev_matrix.shape[0]})"
            +            f"\n(Are modalities the same?)"
            +        )
            +
                 theta_matrix = get_prob_matrix_by_counters(
                     self.n_dw_matrix.dot(phi_rev_matrix)
                 )
            @@ -470,7 +884,7 @@ 

            Methods

            tmp = g_dt.T * self.B / (phi_matrix_tr.sum(axis=1) + EPS) n_tw += (tmp - np.einsum('ij,ji->i', phi_rev_matrix, tmp)) * phi_matrix - return n_tw.T - nwt
            + return self.tau * (n_tw.T - nwt)
          @@ -499,11 +913,14 @@

          Index

        • Functions

        • Classes

          @@ -520,7 +937,7 @@

          -

          Generated by pdoc 0.6.3.

          +

          Generated by pdoc 0.8.1.

          diff --git a/docs/cooking_machine/models/topic_model.html b/docs/cooking_machine/models/topic_model.html index 23601f0..8d92d6b 100644 --- a/docs/cooking_machine/models/topic_model.html +++ b/docs/cooking_machine/models/topic_model.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.models.topic_model API documentation - - + + @@ -21,7 +21,9 @@

          Module topicnet.cooking_machine.models.topic_model
          -Source code + +Expand source code +
          import artm
           import dill
           import glob
          @@ -33,7 +35,6 @@ 

          Module topicnet.cooking_machine.models.topic_model

          + return result.set_index(["model_id", "regularizer_name"]).sort_values(by="regularizer_name") + + def get_regularizer( + self, reg_name: str) -> Union[BaseRegularizer, artm.regularizers.BaseRegularizer]: + """ + Retrieves the regularizer specified, no matter is it custom or "classic" + + Returns + ------- + regularizer + + """ + # TODO: RegularizersWrapper? + + if reg_name in self.custom_regularizers: + return self.custom_regularizers[reg_name] + elif reg_name in self._model.regularizers.data: + return self._model.regularizers.data[reg_name] + else: + raise KeyError( + f'There is no such regularizer "{reg_name}"' + f' among custom and ARTM regularizers!' + )

  • @@ -866,14 +937,14 @@

    Classes

    class TopicModel -(artm_model=None, model_id=None, parent_model_id=None, data_path=None, description=None, experiment=None, callbacks=None, custom_scores=None, custom_regularizers=None, *args, **kwargs) +(artm_model: artm.artm_model.ARTM = None, model_id: str = None, parent_model_id: str = None, data_path: str = None, description: List[Dict[str, Any]] = None, experiment=None, callbacks: List[ControllerAgent] = None, custom_scores: Dict[str, topicnet.cooking_machine.models.base_score.BaseScore] = None, custom_regularizers: Dict[str, topicnet.cooking_machine.models.base_regularizer.BaseRegularizer] = None, *args, **kwargs)
    -

    Topic Model contains artm model and all necessary information: scores, training pipeline, etc.

    +

    Topic Model contains artm model and all necessary information: scores, training pipeline, etc.

    Initialize stage, also used for loading previously saved experiments.

    Parameters

    -
    artm_model : artm model or None
    +
    artm_model : artm model or None
    model to use, None if you want to create model (Default value = None)
    model_id : str
    model id (Default value = None)
    @@ -885,7 +956,7 @@

    Parameters

    description of the model (Default value = None)
    experiment : Experiment
    the experiment to which the model is bound (Default value = None)
    -
    callbacks : list of objects with invoke() method
    +
    callbacks : list of objects with invoke() method
    function called inside _fit which alters model parameters; mainly used for fancy manipulation of regularizer coefficients
    custom_scores : dict
    @@ -893,9 +964,11 @@

    Parameters

    (score class with functionality like that of BaseScore)
    custom_regularizers : dict
    dictionary with regularizer names as keys and regularizer classes as values
    -
    +
    -Source code + +Expand source code +
    class TopicModel(BaseModel):
         """
         Topic Model contains artm model and all necessary information: scores, training pipeline, etc.
    @@ -911,7 +984,7 @@ 

    Parameters

    experiment=None, callbacks: List[ControllerAgent] = None, custom_scores: Dict[str, BaseScore] = None, - custom_regularizers: Dict[str, artm.regularizers.BaseRegularizer] = None, + custom_regularizers: Dict[str, BaseRegularizer] = None, *args, **kwargs): """ Initialize stage, also used for loading previously saved experiments. @@ -1028,24 +1101,13 @@

    Parameters

    return score_values - def _fit(self, dataset_trainable, num_iterations, custom_regularizers=None): - """ - - Parameters - ---------- - dataset_trainable : BatchVectorizer - Data for model fit - num_iterations : int - Amount of fit steps - custom_regularizers : dict of BaseRegularizer - Regularizers to apply to model - - """ + def _prepare_custom_regularizers(self, custom_regularizers): if custom_regularizers is None: custom_regularizers = dict() all_custom_regularizers = deepcopy(custom_regularizers) all_custom_regularizers.update(self.custom_regularizers) + base_regularizers_name, base_regularizers_tau = None, None if len(all_custom_regularizers) != 0: for regularizer in all_custom_regularizers.values(): @@ -1056,7 +1118,29 @@

    Parameters

    base_regularizers_tau = [regularizer.tau for regularizer in self._model.regularizers.data.values()] + return base_regularizers_name, base_regularizers_tau, all_custom_regularizers + + def _fit(self, dataset_trainable, num_iterations, custom_regularizers=None): + """ + + Parameters + ---------- + dataset_trainable : BatchVectorizer + Data for model fit + num_iterations : int + Amount of fit steps + custom_regularizers : dict of BaseRegularizer + Regularizers to apply to model + + """ + (base_regularizers_name, + base_regularizers_tau, + all_custom_regularizers) = self._prepare_custom_regularizers(custom_regularizers) + for cur_iter in range(num_iterations): + precomputed_data = dict() + iter_is_last = cur_iter == num_iterations - 1 + self._model.fit_offline(batch_vectorizer=dataset_trainable, num_collection_passes=1) @@ -1068,9 +1152,26 @@

    Parameters

    for name, custom_score in self.custom_scores.items(): try: - score = custom_score.call(self) + should_compute_now = iter_is_last or custom_score._should_compute(cur_iter) + + if not should_compute_now: + continue + + # TODO: this check is probably should be refined somehow... + # what if some new parameter added to BaseScore.call -> new check?.. + call_parameters = signature(custom_score.call).parameters + + # if-else instead of try-catch: to speed up + if (BaseScore._PRECOMPUTED_DATA_PARAMETER_NAME not in call_parameters + and not any(str(p).startswith('**') for p in call_parameters.values())): + + score = custom_score.call(self) + else: + score = custom_score.call(self, precomputed_data=precomputed_data) + custom_score.update(score) self._model.score_tracker[name] = custom_score + except AttributeError: # TODO: means no "call" attribute? raise AttributeError(f'Score {name} doesn\'t have a desired attribute') @@ -1168,17 +1269,32 @@

    Parameters

    model_save_path = self.model_default_save_path for regularizer_name, regularizer_object in self.custom_regularizers.items(): - try: - save_path = os.path.join(model_save_path, regularizer_name + '.rd') - with open(save_path, 'wb') as reg_f: - dill.dump(regularizer_object, reg_f) - except (TypeError, AttributeError): + # If not do this, there may be problems with pickling: + # `model` is an ARTM-C-like thing, and it may cause problems + # This is safe, because `model` appears in attach(), + # which is called before each iteration + # P.S. and the `model` itself may be needed for a regularizer inside `grad()` + regularizer_object._model = None + + managed_to_pickle = False + + for (pickler, extension) in zip([dill, pickle], ['.rd', '.rp']): + save_path = os.path.join(model_save_path, regularizer_name + extension) + try: - save_path = os.path.join(model_save_path, regularizer_name + '.rp') with open(save_path, 'wb') as reg_f: - pickle.dump(regularizer_object, reg_f) + pickler.dump(regularizer_object, reg_f) except (TypeError, AttributeError): - warnings.warn(f'Cannot save {regularizer_name} regularizer.') + if os.path.isfile(save_path): + os.remove(save_path) + else: + managed_to_pickle = True + + if managed_to_pickle: + break + + if not managed_to_pickle: + warnings.warn(f'Cannot save {regularizer_name} regularizer!') def save(self, model_save_path=None, @@ -1367,6 +1483,7 @@

    Parameters

    class_ids = [class_ids] class_ids_iter = class_ids or self._model.class_ids # TODO: this workaround seems to be a correct solution to this problem + # maybe the next for-loop could be replaced with these three lines if not class_ids_iter: valid_model_name = self._model.model_pwt info = self._model.master.get_phi_info(valid_model_name) @@ -1680,7 +1797,29 @@

    Parameters

    result = pd.DataFrame( columns=["model_id", "regularizer_name", "tau", "gamma", "class_ids"], data=data ) - return result.set_index(["model_id", "regularizer_name"]).sort_values(by="regularizer_name")
    + return result.set_index(["model_id", "regularizer_name"]).sort_values(by="regularizer_name") + + def get_regularizer( + self, reg_name: str) -> Union[BaseRegularizer, artm.regularizers.BaseRegularizer]: + """ + Retrieves the regularizer specified, no matter is it custom or "classic" + + Returns + ------- + regularizer + + """ + # TODO: RegularizersWrapper? + + if reg_name in self.custom_regularizers: + return self.custom_regularizers[reg_name] + elif reg_name in self._model.regularizers.data: + return self._model.regularizers.data[reg_name] + else: + raise KeyError( + f'There is no such regularizer "{reg_name}"' + f' among custom and ARTM regularizers!' + )

    Ancestors

      @@ -1689,7 +1828,6 @@

      Ancestors

      Subclasses

      Static methods

      @@ -1697,7 +1835,7 @@

      Static methods

      def load(path, experiment=None)
      -

      Loads the model.

      +

      Loads the model.

      Parameters

      path : str
      @@ -1707,11 +1845,13 @@

      Parameters

      Returns

      -
      TopicModel
      +
      TopicModel
       
      -
      +
      -Source code + +Expand source code +
      @staticmethod
       def load(path, experiment=None):
           """
      @@ -1782,14 +1922,16 @@ 

      Instance variables

      var all_regularizers
      -

      Gets all regularizers with custom regularizers.

      +

      Gets all regularizers with custom regularizers.

      Returns

      regularizers_dict : dict
      dict with artm.regularizer and BaseRegularizer instances
      -
      +
      -Source code + +Expand source code +
      @property
       def all_regularizers(self):
           """
      @@ -1811,9 +1953,11 @@ 

      Returns

    var background_topics
    -
    +
    -Source code + +Expand source code +
    @property
     def background_topics(self):
         return self.select_topics(["background", "bcg"])
    @@ -1821,9 +1965,11 @@

    Returns

    var class_ids
    -
    +
    -Source code + +Expand source code +
    @property
     def class_ids(self):
         """ """
    @@ -1832,9 +1978,11 @@ 

    Returns

    var description
    -
    +
    -Source code + +Expand source code +
    @property
     def description(self):
         """ """
    @@ -1843,9 +1991,11 @@ 

    Returns

    var regularizers
    -

    Gets regularizers from model.

    +

    Gets regularizers from model.

    -Source code + +Expand source code +
    @property
     def regularizers(self):
         """
    @@ -1855,16 +2005,18 @@ 

    Returns

    return self._model.regularizers
    -
    var scores
    +
    var scores : Dict[str, List[float]]
    -

    Gets score values by name.

    +

    Gets score values by name.

    Returns

    -
    dict : string -> list
    +
    dict : string -> list
    dictionary with scores and corresponding values
    -
    +
    -Source code + +Expand source code +
    @property
     def scores(self) -> Dict[str, List[float]]:
         """
    @@ -1883,9 +2035,11 @@ 

    Returns

    var specific_topics
    -
    +
    -Source code + +Expand source code +
    @property
     def specific_topics(self):
         return self.select_topics(["background", "bcg"], invert=True)
    @@ -1898,7 +2052,7 @@

    Methods

    def clone(self, model_id=None)
    -

    Creates a copy of the model except model_id.

    +

    Creates a copy of the model except model_id.

    Parameters

    model_id : str
    @@ -1906,11 +2060,13 @@

    Parameters

    Returns

    -
    TopicModel
    +
    TopicModel
     
    -
    +
    -Source code + +Expand source code +
    def clone(self, model_id=None):
         """
         Creates a copy of the model except model_id.
    @@ -1943,9 +2099,11 @@ 

    Returns

    def describe_regularizers(self)
    -
    +
    -Source code + +Expand source code +
    def describe_regularizers(self):
         data = []
         for reg_name, reg in self.regularizers._data.items():
    @@ -1966,9 +2124,11 @@ 

    Returns

    def describe_scores(self, verbose=False)
    -
    +
    -Source code + +Expand source code +
    def describe_scores(self, verbose=False):
         data = []
         for score_name, score in self.scores.items():
    @@ -1985,9 +2145,11 @@ 

    Returns

    def get_init_parameters(self, not_include=None)
    -
    +
    -Source code + +Expand source code +
    def get_init_parameters(self, not_include=None):
         if not_include is None:
             not_include = list()
    @@ -2007,14 +2169,16 @@ 

    Returns

    def get_jsonable_from_parameters(self)
    -

    Gets artm model params.

    +

    Gets artm model params.

    Returns

    dict
    artm model parameters
    -
    +
    -Source code + +Expand source code +
    def get_jsonable_from_parameters(self):
         """
         Gets artm model params.
    @@ -2053,7 +2217,7 @@ 

    Returns

    def get_phi(self, topic_names=None, class_ids=None, model_name=None)
    -

    Gets custom Phi matrix of model.

    +

    Gets custom Phi matrix of model.

    Parameters

    topic_names : list of str or str
    @@ -2070,9 +2234,11 @@

    Returns

    pd.DataFrame
    phi matrix
    -
    +
    -Source code + +Expand source code +
    def get_phi(self, topic_names=None, class_ids=None, model_name=None):
         """
         Gets custom Phi matrix of model.
    @@ -2101,6 +2267,7 @@ 

    Returns

    class_ids = [class_ids] class_ids_iter = class_ids or self._model.class_ids # TODO: this workaround seems to be a correct solution to this problem + # maybe the next for-loop could be replaced with these three lines if not class_ids_iter: valid_model_name = self._model.model_pwt info = self._model.master.get_phi_info(valid_model_name) @@ -2124,7 +2291,7 @@

    Returns

    def get_phi_dense(self, topic_names=None, class_ids=None, model_name=None)
    -

    Gets custom Phi matrix of model.

    +

    Gets custom Phi matrix of model.

    Parameters

    topic_names : list of str or str
    @@ -2139,11 +2306,13 @@

    Parameters

    Returns

    -
    3-tuple
    +
    3-tuple
    dense phi matrix
    -
    +
    -Source code + +Expand source code +
    def get_phi_dense(self, topic_names=None, class_ids=None, model_name=None):
         """
         Gets custom Phi matrix of model.
    @@ -2173,7 +2342,7 @@ 

    Returns

    def get_phi_sparse(self, topic_names=None, class_ids=None, model_name=None, eps=None)
    -

    Gets custom Phi matrix of model as sparse scipy matrix.

    +

    Gets custom Phi matrix of model as sparse scipy matrix.

    Parameters

    topic_names : list of str or str
    @@ -2190,11 +2359,13 @@

    Parameters

    Returns

    -
    3-tuple
    +
    3-tuple
    sparse phi matrix
    -
    +
    -Source code + +Expand source code +
    def get_phi_sparse(self, topic_names=None, class_ids=None, model_name=None, eps=None):
         """
         Gets custom Phi matrix of model as sparse scipy matrix.
    @@ -2222,11 +2393,48 @@ 

    Returns

    return self._model.get_phi_sparse(topic_names, class_ids, model_name, eps)
    +
    +def get_regularizer(self, reg_name: str) -> Union[BaseRegularizer, artm.regularizers.BaseRegularizer] +
    +
    +

    Retrieves the specified regularizer, regardless of whether it is custom or "classic".

    +

    Returns

    +
    +
    regularizer
    +
     
    +
    +
    + +Expand source code + +
    def get_regularizer(
    +        self, reg_name: str) -> Union[BaseRegularizer, artm.regularizers.BaseRegularizer]:
    +    """
    +    Retrieves the specified regularizer, regardless of whether it is custom or "classic"
    +
    +    Returns
    +    -------
    +    regularizer
    +
    +    """
    +    # TODO: RegularizersWrapper?
    +
    +    if reg_name in self.custom_regularizers:
    +        return self.custom_regularizers[reg_name]
    +    elif reg_name in self._model.regularizers.data:
    +        return self._model.regularizers.data[reg_name]
    +    else:
    +        raise KeyError(
    +            f'There is no such regularizer "{reg_name}"'
    +            f' among custom and ARTM regularizers!'
    +        )
    +
    +
    def get_theta(self, topic_names=None, dataset=None, theta_matrix_type='dense_theta', predict_class_id=None, sparse=False, eps=None)
    -

    Gets Theta matrix as pandas DataFrame +

    Gets Theta matrix as pandas DataFrame or sparse scipy matrix.

    Parameters

    @@ -2254,9 +2462,11 @@

    Returns

    pd.DataFrame
    theta matrix
    -
    +
    -Source code + +Expand source code +
    def get_theta(self, topic_names=None,
                   dataset=None,
                   theta_matrix_type='dense_theta',
    @@ -2321,12 +2531,12 @@ 

    Returns

    def make_dummy(self, save_to_drive=True, save_path=None, dataset=None)
    -

    Makes topic model dummy in-place.

    +

    Makes topic model dummy in-place.

    Parameters

    save_to_drive : bool
    Whether to save model to drive or not. If not, the info will be lost
    -
    save_path : str (or None)
    +
    save_path : str (or None)
    Path to folder to dump info to
    dataset : Dataset
    Dataset with text collection on which the model was trained. @@ -2337,11 +2547,13 @@

    Notes

    but there is no ARTM model inside! (so model.get_phi() won't work!) If one wants to use the topic model as before, this ARTM model should be restored first:

    -
    >>> save_path = topic_model.model_default_save_path
    +
    >>> save_path = topic_model.model_default_save_path
     >>> topic_model._model = artm.load_artm_model(f'{save_path}/model')
    -
    +
    -Source code + +Expand source code +
    def make_dummy(self, save_to_drive=True, save_path=None, dataset=None):
         """Makes topic model dummy in-place.
     
    @@ -2394,7 +2606,7 @@ 

    Notes

    def save(self, model_save_path=None, phi=True, theta=False, dataset=None)
    -

    Saves model description and dumps artm model. +

    Saves model description and dumps artm model. Use this method if you want to dump the model.

    Parameters

    @@ -2406,9 +2618,11 @@

    Parameters

    save theta in csv format if True
    dataset : Dataset
    dataset
    -
    +
    -Source code + +Expand source code +
    def save(self,
              model_save_path=None,
              phi=True,
    @@ -2482,39 +2696,58 @@ 

    Parameters

    def save_custom_regularizers(self, model_save_path=None)
    -
    +
    -Source code + +Expand source code +
    def save_custom_regularizers(self, model_save_path=None):
         if model_save_path is None:
             model_save_path = self.model_default_save_path
     
         for regularizer_name, regularizer_object in self.custom_regularizers.items():
    -        try:
    -            save_path = os.path.join(model_save_path, regularizer_name + '.rd')
    -            with open(save_path, 'wb') as reg_f:
    -                dill.dump(regularizer_object, reg_f)
    -        except (TypeError, AttributeError):
    +        # If this is not done, there may be problems with pickling:
    +        # `model` is an ARTM-C-like thing, and it may cause problems
    +        # This is safe, because `model` appears in attach(),
    +        # which is called before each iteration
    +        # P.S. and the `model` itself may be needed for a regularizer inside `grad()`
    +        regularizer_object._model = None
    +
    +        managed_to_pickle = False
    +
    +        for (pickler, extension) in zip([dill, pickle], ['.rd', '.rp']):
    +            save_path = os.path.join(model_save_path, regularizer_name + extension)
    +
                 try:
    -                save_path = os.path.join(model_save_path, regularizer_name + '.rp')
                     with open(save_path, 'wb') as reg_f:
    -                    pickle.dump(regularizer_object, reg_f)
    +                    pickler.dump(regularizer_object, reg_f)
                 except (TypeError, AttributeError):
    -                warnings.warn(f'Cannot save {regularizer_name} regularizer.')
    + if os.path.isfile(save_path): + os.remove(save_path) + else: + managed_to_pickle = True + + if managed_to_pickle: + break + + if not managed_to_pickle: + warnings.warn(f'Cannot save {regularizer_name} regularizer!')
    def select_topics(self, substrings, invert=False)
    -

    Gets all topics containing specified substring

    +

    Gets all topics containing specified substring

    Returns

    list
     
    -
    +
    -Source code + +Expand source code +
    def select_topics(self, substrings, invert=False):
         """
         Gets all topics containing specified substring
    @@ -2535,10 +2768,10 @@ 

    Returns

    def to_dummy(self, save_path=None)
    -

    Creates dummy model

    +

    Creates dummy model

    Parameters

    -
    save_path : str (or None)
    +
    save_path : str (or None)
    Path to folder with dumped info about topic model

    Returns

    @@ -2549,9 +2782,11 @@

    Returns

    Notes

    Dummy model has the same model_id as the original model, -but "model_id" key in experiment.models contains original model, not dummy

    +but "model_id" key in experiment.models contains original model, not dummy

    -Source code + +Expand source code +
    def to_dummy(self, save_path=None):
         """Creates dummy model
     
    @@ -2638,6 +2873,7 @@ 

    get_phi
  • get_phi_dense
  • get_phi_sparse
  • +
  • get_regularizer
  • get_theta
  • load
  • make_dummy
  • @@ -2656,7 +2892,7 @@

    diff --git a/docs/cooking_machine/models/topic_prior_regularizer.html b/docs/cooking_machine/models/topic_prior_regularizer.html index 0f066f0..7f31094 100644 --- a/docs/cooking_machine/models/topic_prior_regularizer.html +++ b/docs/cooking_machine/models/topic_prior_regularizer.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.models.topic_prior_regularizer API documentation - - + + @@ -21,7 +21,9 @@

    Module topicnet.cooking_machine.models.topic_prior_regul
    -Source code + +Expand source code +
    import numpy as np
     import warnings
     from .base_regularizer import BaseRegularizer
    @@ -154,7 +156,7 @@ 

    Classes

    (name, tau, num_topics=None, beta=1)
    -

    TopicPriorRegularizer adds prior beta_t to every column +

    TopicPriorRegularizer adds prior beta_t to every column in Phi matrix of ARTM model. Thus every phi_wt has preassigned prior probability of being attached to topic t.

    If beta is balanced with respect to apriori collection balance, @@ -169,9 +171,11 @@

    Parameters

    Number of topics for uniform sampling
    beta : float or list or np.array
    Prior for columns of Phi matrix (Default value = 1)
    -
    +
    -Source code + +Expand source code +
    class TopicPriorRegularizer(BaseRegularizer):
         """
         TopicPriorRegularizer adds prior beta_t to every column
    @@ -232,9 +236,11 @@ 

    Methods

    def grad(self, pwt, nwt)
    -
    +
    -Source code + +Expand source code +
    def grad(self, pwt, nwt):
         grad_array = np.repeat([self.beta * self.tau], pwt.shape[0], axis=0)
     
    @@ -256,7 +262,7 @@ 

    Inherited members

    (name, tau, num_topics=None, beta_prior=(), random_seed=42)
    -

    TopicPriorSampleRegularizer adds prior beta_t to every column +

    TopicPriorSampledRegularizer adds prior beta_t to every column in Phi matrix of ARTM model. Thus every phi_wt has a preassigned prior probability of being attached to topic t.

    Beta vector is sampled from @@ -280,9 +286,11 @@

    Parameters

    Prior for Dirichlet distribution to sample beta parameter
    random_seed : int
    Random seed for Dirichlet distribution (Default value = 42)
    -
    +
    -Source code + +Expand source code +
    class TopicPriorSampledRegularizer(BaseRegularizer):
         """
         TopicPriorSampleRegularizer adds prior beta_t to every column
    @@ -354,9 +362,11 @@ 

    Methods

    def grad(self, pwt, nwt)
    -
    +
    -Source code + +Expand source code +
    def grad(self, pwt, nwt):
         grad_array = np.repeat([self.beta * self.tau], pwt.shape[0], axis=0)
     
    @@ -407,7 +417,7 @@ 

    -

    Generated by pdoc 0.6.3.

    +

    Generated by pdoc 0.8.1.

    diff --git a/docs/cooking_machine/pretty_output.html b/docs/cooking_machine/pretty_output.html index 85a190d..597d6aa 100644 --- a/docs/cooking_machine/pretty_output.html +++ b/docs/cooking_machine/pretty_output.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.pretty_output API documentation - - + + @@ -21,7 +21,9 @@

    Module topicnet.cooking_machine.pretty_output

    -Source code + +Expand source code +
    import numpy as np
     
     from datetime import datetime
    @@ -336,7 +338,7 @@ 

    Functions

    def add_non_tree_strings(strings, strings_to_add, add_separator=True)
    -

    Adding training stage strings +

    Adding training stage strings to the experiment description

    Parameters

    @@ -352,9 +354,11 @@

    Returns

    strings : list of strings
    description of the experiment
    -
    +
    -Source code + +Expand source code +
    def add_non_tree_strings(strings, strings_to_add, add_separator=True):
         """
         Adding training stage strings
    @@ -390,11 +394,11 @@ 

    Returns

    -def get_criteria_strings(criteria, tab=' -', min_len_per_cube=26) +def get_criteria_strings(criteria, tab: str = ' +', min_len_per_cube: int = 26)
    -

    Parameters

    +

    Parameters

    criteria : list of str
     
    @@ -408,9 +412,11 @@

    Returns

    dict
     
    -
    +
    -Source code + +Expand source code +
    def get_criteria_strings(criteria, tab: str = "  ", min_len_per_cube: int = MODEL_NAME_LENGTH):
         """
     
    @@ -449,11 +455,11 @@ 

    Returns

    -def get_cube_strings(cubes, tab=' -', min_len_per_cube=26) +def get_cube_strings(cubes, tab: str = ' +', min_len_per_cube: int = 26)
    -

    Parameters

    +

    Parameters

    cubes : list of dict
     
    @@ -467,9 +473,11 @@

    Returns

    dict
     
    -
    +
    -Source code + +Expand source code +
    def get_cube_strings(cubes, tab: str = "  ", min_len_per_cube: int = MODEL_NAME_LENGTH):
         """
     
    @@ -509,10 +517,10 @@ 

    Returns

    -def get_html(experiment, window_size=1500) +def get_html(experiment, window_size: int = 1500)
    -

    Gets html text to save human-readable description of the experiment.

    +

    Gets html text to save human-readable description of the experiment.

    Parameters

    window_size : int
    @@ -522,9 +530,11 @@

    Returns

    str
    description of the experiment in html format
    -
    +
    -Source code + +Expand source code +
    def get_html(experiment, window_size: int = 1500):
         """
         Gets html text to save human-readable description of the experiment.
    @@ -564,11 +574,11 @@ 

    Returns

    -def give_strings_description(experiment, tab=' -', min_len_per_cube=26, len_tree_step=27) +def give_strings_description(experiment, tab: str = ' +', min_len_per_cube: int = 26, len_tree_step: int = 27)
    -

    Gets strings description of the experiment.

    +

    Gets strings description of the experiment.

    Parameters

    tab : str
    @@ -585,9 +595,11 @@

    Returns

    list
    strings description
    -
    +
    -Source code + +Expand source code +
    def give_strings_description(experiment,
                                  tab: str = "  ",
                                  min_len_per_cube: int = MODEL_NAME_LENGTH,
    @@ -662,9 +674,11 @@ 

    Returns

    def make_notebook_pretty()
    -
    +
    -Source code + +Expand source code +
    def make_notebook_pretty():
         from IPython.core.display import display, HTML
     
    @@ -684,15 +698,15 @@ 

    Returns

    -def resize_value(key, value, tab=' +def resize_value(key, value, tab: str = ' ')
    -

    Parameters

    +

    Parameters

    key : str
     
    -
    value : optional
    +
    value : optional
     
    tab : str
    (Default value = " @@ -702,9 +716,11 @@

    Returns

    list
     
    -
    +
    -Source code + +Expand source code +
    def resize_value(key, value, tab: str = "  "):
         """
     
    @@ -777,7 +793,7 @@ 

    Index

    diff --git a/docs/cooking_machine/recipes/artm_baseline_pipeline.html b/docs/cooking_machine/recipes/artm_baseline_pipeline.html index eede585..9c2ee47 100644 --- a/docs/cooking_machine/recipes/artm_baseline_pipeline.html +++ b/docs/cooking_machine/recipes/artm_baseline_pipeline.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.recipes.artm_baseline_pipeline API documentation - - + + @@ -21,11 +21,15 @@

    Module topicnet.cooking_machine.recipes.artm_baseline_pi
    -Source code + +Expand source code +
    from typing import List
    +
     from .recipe_wrapper import BaseRecipe
     from .. import Dataset
     
    +
     ARTM_baseline_template = '''
     # This config follows a strategy described by Murat Apishev
     # one of the core programmers of BigARTM library in personal correspondence.
    @@ -64,6 +68,7 @@ 

    Module topicnet.cooking_machine.recipes.artm_baseline_pi num_top_tokens: 30 model: dataset_path: {dataset_path} + {dictionary_filter_parameters} modalities_to_use: {modality_list} main_modality: '{main_modality}' @@ -86,6 +91,8 @@

    Module topicnet.cooking_machine.recipes.artm_baseline_pi use_relative_coefficients: true ''' +ONE_CONFIG_INDENT = 4 * ' ' + class BaselineRecipe(BaseRecipe): """ @@ -98,6 +105,7 @@

    Module topicnet.cooking_machine.recipes.artm_baseline_pi def format_recipe( self, dataset_path: str, + dictionary_filter_parameters: dict = None, modality_list: List[str] = None, topic_number: int = 20, background_topic_number: int = 1, @@ -110,13 +118,23 @@

    Module topicnet.cooking_machine.recipes.artm_baseline_pi background_topics = [f'bcg_{i}' for i in range( len(specific_topics), len(specific_topics) + background_topic_number)] + if dictionary_filter_parameters is None: + dictionary_filter_parameters = dict() + + dictionary_filter_parameters_as_yml = self._format_dictionary_filter_parameters( + dictionary_filter_parameters, + indent=2 * ONE_CONFIG_INDENT, + ) + self._recipe = self.recipe_template.format( dataset_path=dataset_path, + dictionary_filter_parameters=dictionary_filter_parameters_as_yml, modality_list=modality_list, main_modality=modality_list[0], specific_topics=specific_topics, background_topics=background_topics, ) + return self._recipe

    @@ -133,10 +151,12 @@

    Classes

    class BaselineRecipe
    -

    Class for baseline recipe creation and -unification of recipe interface

    +

    Class for baseline recipe creation and +unification of recipe interface

    -Source code + +Expand source code +
    class BaselineRecipe(BaseRecipe):
         """
         Class for baseline recipe creation and
    @@ -148,6 +168,7 @@ 

    Classes

    def format_recipe( self, dataset_path: str, + dictionary_filter_parameters: dict = None, modality_list: List[str] = None, topic_number: int = 20, background_topic_number: int = 1, @@ -160,13 +181,23 @@

    Classes

    background_topics = [f'bcg_{i}' for i in range( len(specific_topics), len(specific_topics) + background_topic_number)] + if dictionary_filter_parameters is None: + dictionary_filter_parameters = dict() + + dictionary_filter_parameters_as_yml = self._format_dictionary_filter_parameters( + dictionary_filter_parameters, + indent=2 * ONE_CONFIG_INDENT, + ) + self._recipe = self.recipe_template.format( dataset_path=dataset_path, + dictionary_filter_parameters=dictionary_filter_parameters_as_yml, modality_list=modality_list, main_modality=modality_list[0], specific_topics=specific_topics, background_topics=background_topics, ) + return self._recipe

    Ancestors

    @@ -208,7 +239,7 @@

    -

    Generated by pdoc 0.6.3.

    +

    Generated by pdoc 0.8.1.

    diff --git a/docs/cooking_machine/recipes/exploratory_search_pipeline.html b/docs/cooking_machine/recipes/exploratory_search_pipeline.html index 9fee611..28b8686 100644 --- a/docs/cooking_machine/recipes/exploratory_search_pipeline.html +++ b/docs/cooking_machine/recipes/exploratory_search_pipeline.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.recipes.exploratory_search_pipeline API documentation - - + + @@ -21,7 +21,9 @@

    Module topicnet.cooking_machine.recipes.exploratory_sear
    -Source code + +Expand source code +
    from .recipe_wrapper import BaseRecipe
     from .. import Dataset
     
    @@ -44,6 +46,10 @@ 

    Module topicnet.cooking_machine.recipes.exploratory_sear # specific_topics=specific_topics, background_topics=background_topics) # when loading the recipe to adjust for your dataset +# If you have more than one modaity you want to use, we recommend employing +# more advanced MultimodalSearchRecipe from multimodal_exploratory_search_pipeline instead + + topics: # Describes number of model topics, in the actuall article 200 topics were found to be optimal specific_topics: {{specific_topics}} @@ -166,10 +172,12 @@

    Classes

    class SearchRecipe
    -

    Class for baseline recipe creation and -unification of recipe interface

    +

    Class for baseline recipe creation and +unification of recipe interface

    -Source code + +Expand source code +
    class SearchRecipe(BaseRecipe):
         """
         Class for baseline recipe creation and
    @@ -239,7 +247,7 @@ 

    -

    Generated by pdoc 0.6.3.

    +

    Generated by pdoc 0.8.1.

    diff --git a/docs/cooking_machine/recipes/index.html b/docs/cooking_machine/recipes/index.html index 1642adf..603a2bc 100644 --- a/docs/cooking_machine/recipes/index.html +++ b/docs/cooking_machine/recipes/index.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.recipes API documentation - - + + @@ -19,19 +19,11 @@

    Module topicnet.cooking_machine.recipes

    -

    TopicNet Recipes

    -

    This module contains mechanisms to generate code for training topic models on your data. It was created in orded to simplify knowledge transition about model training from the field researchers to the end users and possibly for easier exchange of a code between the research groups. As a backbone it uses snippets of YAML configs that require filling in information about the collection and hyperparameters of the required topic model. Currently it is recommended to import BaselineRecipe, SearchRecipe, MultimodalSearchRecipe classes for the experiment environment generation. However, for the compatibility with previous examples found in topicnet/demos/*-Recipe.ipynb notebooks we also have ARTM-baseline and exploratory_search configs in YAML format.

    -
    -
      -
    • BaselineRecipe - Class for generating a pipeline training a topic models with decorrelation regularization, maximizing custom BleiLafferty score from TopicNet library topicnet.cooking_machine.models.scores.BleiLaffertyScore.
    • -
    • SearchRecipe - a Class recreating training scenario from exploratory_search YAML config. Provides good startegy for training topic models for collection search properties. A link to the publication can be found in the comments section of the recipe.
    • -
    • MultimodalSearchRecipe - a Class that modifies previos strategy for the case of multimodal data allowing to recreate previous scenario for each modality separately.
    • -
    • intratext_coherence_maximization.yml - a strin in YAML format (like the old recipes) allowing to build topic model with decorrelation, Phi and Theta matrices Sparsing and Smoothing with background topics maximizing the intratext coherence score topicnet.cooking_machine.models.scores.IntratextCoherenceScore.
    • -
    • topic_number_search.yml - a recipe recreating published strategy to find optimal topic number for given dataset. References to the publication can be found in the config dosctring.
    • -
    -Source code + +Expand source code +
    from .multimodal_exploratory_search_pipeline import MultimodalSearchRecipe
     from .artm_baseline_pipeline import BaselineRecipe
     from .intratext_coherence_pipeline import IntratextCoherenceRecipe
    @@ -45,27 +37,27 @@ 

    Sub-modules

    topicnet.cooking_machine.recipes.artm_baseline_pipeline
    -
    +
    topicnet.cooking_machine.recipes.exploratory_search_pipeline
    -
    +
    topicnet.cooking_machine.recipes.intratext_coherence_pipeline
    -
    +
    topicnet.cooking_machine.recipes.multimodal_exploratory_search_pipeline
    -
    +
    topicnet.cooking_machine.recipes.recipe_wrapper
    -
    +
    topicnet.cooking_machine.recipes.wntm
    -
    +
    @@ -77,7 +69,7 @@

    Sub-modules

    diff --git a/docs/cooking_machine/recipes/intratext_coherence_pipeline.html b/docs/cooking_machine/recipes/intratext_coherence_pipeline.html index bbf8bf9..962db37 100644 --- a/docs/cooking_machine/recipes/intratext_coherence_pipeline.html +++ b/docs/cooking_machine/recipes/intratext_coherence_pipeline.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.recipes.intratext_coherence_pipeline API documentation - - + + @@ -21,7 +21,9 @@

    Module topicnet.cooking_machine.recipes.intratext_cohere
    -Source code + +Expand source code +
    import os
     import warnings
     
    @@ -30,6 +32,8 @@ 

    Module topicnet.cooking_machine.recipes.intratext_cohere from .recipe_wrapper import BaseRecipe from .. import Dataset +ONE_CONFIG_INDENT = 4 * ' ' + class IntratextCoherenceRecipe(BaseRecipe): """ @@ -59,6 +63,7 @@

    Module topicnet.cooking_machine.recipes.intratext_cohere dataset_path: str, num_specific_topics: int, main_modality: str = None, + dictionary_filter_parameters: dict = None, num_background_topics: int = 1, modalities: List[str] = None, keep_dataset_in_memory: bool = True, @@ -142,10 +147,19 @@

    Module topicnet.cooking_machine.recipes.intratext_cohere for i in range(num_specific_topics, num_specific_topics + num_background_topics) ] + if dictionary_filter_parameters is None: + dictionary_filter_parameters = dict() + + dictionary_filter_parameters_as_yml = self._format_dictionary_filter_parameters( + dictionary_filter_parameters, + indent=2 * ONE_CONFIG_INDENT, + ) + self._recipe = self.recipe_template.format( modality_names=modalities, main_modality=main_modality, dataset_path=dataset_path, + dictionary_filter_parameters=dictionary_filter_parameters_as_yml, keep_dataset_in_memory=keep_dataset_in_memory, keep_dataset=keep_dataset, documents_fraction=documents_fraction, @@ -171,7 +185,7 @@

    Classes

    class IntratextCoherenceRecipe
    -

    The recipe mainly consists of basic cube stages, +

    The recipe mainly consists of basic cube stages, such as Decorrelation, Sparsing and Smoothing. In this way it is similar to ARTM baseline recipe. The core difference is that models selected based on their IntratextCoherenceScore @@ -179,9 +193,11 @@

    Classes

    PerplexityScore is also calculated to assure that models don't have high perplexity, but the main criteria is IntratextCoherenceScore.

    For more details about IntratextCoherence -one may see the paper http://www.dialog-21.ru/media/4281/alekseevva.pdf

    +one may see the paper http://www.dialog-21.ru/media/4281/alekseevva.pdf

    -Source code + +Expand source code +
    class IntratextCoherenceRecipe(BaseRecipe):
         """
         The recipe mainly consists of basic cube stages,
    @@ -210,6 +226,7 @@ 

    Classes

    dataset_path: str, num_specific_topics: int, main_modality: str = None, + dictionary_filter_parameters: dict = None, num_background_topics: int = 1, modalities: List[str] = None, keep_dataset_in_memory: bool = True, @@ -293,10 +310,19 @@

    Classes

    for i in range(num_specific_topics, num_specific_topics + num_background_topics) ] + if dictionary_filter_parameters is None: + dictionary_filter_parameters = dict() + + dictionary_filter_parameters_as_yml = self._format_dictionary_filter_parameters( + dictionary_filter_parameters, + indent=2 * ONE_CONFIG_INDENT, + ) + self._recipe = self.recipe_template.format( modality_names=modalities, main_modality=main_modality, dataset_path=dataset_path, + dictionary_filter_parameters=dictionary_filter_parameters_as_yml, keep_dataset_in_memory=keep_dataset_in_memory, keep_dataset=keep_dataset, documents_fraction=documents_fraction, @@ -315,10 +341,10 @@

    Ancestors

    Methods

    -def format_recipe(self, dataset_path, num_specific_topics, main_modality=None, num_background_topics=1, modalities=None, keep_dataset_in_memory=True, keep_dataset=False, documents_fraction=0.5, one_stage_num_iter=20, verbose=True) +def format_recipe(self, dataset_path: str, num_specific_topics: int, main_modality: str = None, dictionary_filter_parameters: dict = None, num_background_topics: int = 1, modalities: List[str] = None, keep_dataset_in_memory: bool = True, keep_dataset: bool = False, documents_fraction: float = 0.5, one_stage_num_iter: int = 20, verbose: bool = True) -> str
    -

    Parameters

    +

    Parameters

    dataset_path
    Path to the dataset .csv file
    @@ -370,14 +396,17 @@

    Methods

    and 5 * 5 coherence computations (here may be slow if documents_fraction is high)
    verbose
    Whether to show experiment progress or not
    -

    +
    -Source code + +Expand source code +
    def format_recipe(
             self,
             dataset_path: str,
             num_specific_topics: int,
             main_modality: str = None,
    +        dictionary_filter_parameters: dict = None,
             num_background_topics: int = 1,
             modalities: List[str] = None,
             keep_dataset_in_memory: bool = True,
    @@ -461,10 +490,19 @@ 

    Methods

    for i in range(num_specific_topics, num_specific_topics + num_background_topics) ] + if dictionary_filter_parameters is None: + dictionary_filter_parameters = dict() + + dictionary_filter_parameters_as_yml = self._format_dictionary_filter_parameters( + dictionary_filter_parameters, + indent=2 * ONE_CONFIG_INDENT, + ) + self._recipe = self.recipe_template.format( modality_names=modalities, main_modality=main_modality, dataset_path=dataset_path, + dictionary_filter_parameters=dictionary_filter_parameters_as_yml, keep_dataset_in_memory=keep_dataset_in_memory, keep_dataset=keep_dataset, documents_fraction=documents_fraction, @@ -515,7 +553,7 @@

    -

    Generated by pdoc 0.6.3.

    +

    Generated by pdoc 0.8.1.

    diff --git a/docs/cooking_machine/recipes/multimodal_exploratory_search_pipeline.html b/docs/cooking_machine/recipes/multimodal_exploratory_search_pipeline.html index b8568c0..cf9953b 100644 --- a/docs/cooking_machine/recipes/multimodal_exploratory_search_pipeline.html +++ b/docs/cooking_machine/recipes/multimodal_exploratory_search_pipeline.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.recipes.multimodal_exploratory_search_pipeline API documentation - - + + @@ -21,8 +21,10 @@

    Module topicnet.cooking_machine.recipes.multimodal_explo
    -Source code -
    from typing import List, Union
    +
    +Expand source code
    +
    +
    from typing import List, Union, Dict
     from .recipe_wrapper import BaseRecipe
     from .. import Dataset
     
    @@ -51,7 +53,7 @@ 

    Module topicnet.cooking_machine.recipes.multimodal_explo model: dataset_path: {dataset_path} - modalities_to_use: {modality_list} + {modalities_description} main_modality: '{modality}' stages: @@ -91,7 +93,7 @@

    Module topicnet.cooking_machine.recipes.multimodal_explo verbose: false use_relative_coefficients: True '''.format('PerplexityScore@all < 1.01 * MINIMUM(PerplexityScore@all)' + - ' and SparsityPhiScore{modality} -> max') + ' and SparsityThetaScore -> max') # Had to change tracked score function. Is it fine? decor_phi_cube_template = ''' @@ -104,9 +106,9 @@

    Module topicnet.cooking_machine.recipes.multimodal_explo - {0} strategy: PerplexityStrategy strategy_params: - start_point: 0 - step: 0.02 - max_len: 20 + start_point: 0.005 + step: 0.005 + max_len: 10 tracked_score_function: PerplexityScore{{modality}} verbose: false use_relative_coefficients: True @@ -158,11 +160,38 @@

    Module topicnet.cooking_machine.recipes.multimodal_explo def format_recipe( self, dataset_path: str, - modality_list: List[str] = None, + modality_list: List[str] or Dict = None, + main_modality: str = None, topic_number: int = 20, background_topic_number: int = 1, num_iter: Union[int, List[int]] = 20, ): + ''' + Creates a recipe for multimodal search + using basic template at the top of this file + + Parameters + ---------- + dataset_path : path to the data + main_modality : str + chosen to be main modality from modality list, if possible + if it is not specified, the function attempts to user + the first entry of `modality_list` instead + + modality_list : list of modality names to use + or a dict specifying the (relative) weight of each + topic_number: + number of the model topics + background_topic_number : + number of background topics + num_iter : + specifying number of iterations for each cube + + Returns + ------- + string specifying recipe for multimodal search + ''' + if modality_list is None: modality_list = list(Dataset(dataset_path).get_possible_modalities()) @@ -170,8 +199,13 @@

    Module topicnet.cooking_machine.recipes.multimodal_explo background_topics = [f'bcg_{i}' for i in range( len(specific_topics), len(specific_topics) + background_topic_number)] + if main_modality is None: + if isinstance(modality_list, list): + main_modality = modality_list[0] + else: + raise TypeError("main_modality should be specified") self._make_multimodal_recipe( - modality=modality_list[0], + modality=main_modality, dataset_path=dataset_path, specific_topics=specific_topics, background_topics=background_topics, @@ -235,14 +269,12 @@

    Module topicnet.cooking_machine.recipes.multimodal_explo num_iter=iterations)) cube_templates.append(smooth_phi_cube_template.format(modality=modality, num_iter=iterations)) - cube_templates.append(sparse_theta_cube_template.format(modality=modality, - num_iter=iterations)) + cube_templates.append(sparse_theta_cube_template.format(num_iter=iterations)) else: raise ValueError('That option is not availiable') if self._order == 'extended_modalities': iterations = num_iter[-1] - cube_templates.append(sparse_theta_cube_template.format(modality=modality_list[0], - num_iter=iterations)) + cube_templates.append(sparse_theta_cube_template.format(num_iter=iterations)) return ''.join(cube_templates) def _make_multimodal_recipe( @@ -251,41 +283,31 @@

    Module topicnet.cooking_machine.recipes.multimodal_explo modality: str, specific_topics: List[str], background_topics: List[str], - modality_list: List[str] = None, + modality_list: List[str] or Dict = None, background_topic_number: int = 1, num_iter: Union[int, List[int]] = 20, ): - ''' - Creates a recipe for multimodal search - using basic template at the top of this file - - Parameters - ---------- - dataset_path : path to the data - modality : str - chosen to be main modality from modality list - modality_list : list of modality names to use - specific_topics : list of str - names of the model topics - background_topics : list of background topic names - num_iter : number or list of numbers - specifying number of iterations for each cube - - Returns - ------- - string specifying recipe for multimodal search - ''' - reg_forms = self._form_regularizers(modality_list) cube_forms = self._form_and_order_cubes( modality_list, num_iter=num_iter,) + if isinstance(modality_list, list): + modalities_description = f"modalities_to_use: {modality_list}" + elif isinstance(modality_list, dict): + # this line has correct whitespace count + header_string = "modalities_weights:" + # these ones should be indented one level more, so 8 spaces + data_strings = [f"'{k}': {v}" for k, v in modality_list.items()] + strings = [header_string] + data_strings + modalities_description = "\n ".join(strings) + else: + raise TypeError("modality_list should be either list or dict, not {type(modality_list}") self._recipe = self.recipe_template.format( modality=modality, dataset_path=dataset_path, specific_topics=specific_topics, background_topics=background_topics, - modality_list=modality_list, + modalities_description=modalities_description, syntesized_regularizers=reg_forms, syntesized_stages=cube_forms)

    @@ -304,7 +326,7 @@

    Classes

    (order='extended_modalities')
    -

    Class for multimodal search recipe creation and +

    Class for multimodal search recipe creation and unification of recipe usage interface

    Parameters

    @@ -314,9 +336,11 @@

    Parameters

    for each dataset modality while 'extended_modalities' extends only modality-reliant blocks of training keeping last part equivalent to the original pipeline
    -
    +
    -Source code + +Expand source code +
    class MultimodalSearchRecipe(BaseRecipe):
         """
         Class for multimodal search recipe creation and
    @@ -340,11 +364,38 @@ 

    Parameters

    def format_recipe( self, dataset_path: str, - modality_list: List[str] = None, + modality_list: List[str] or Dict = None, + main_modality: str = None, topic_number: int = 20, background_topic_number: int = 1, num_iter: Union[int, List[int]] = 20, ): + ''' + Creates a recipe for multimodal search + using basic template at the top of this file + + Parameters + ---------- + dataset_path : path to the data + main_modality : str + chosen to be main modality from modality list, if possible + if it is not specified, the function attempts to user + the first entry of `modality_list` instead + + modality_list : list of modality names to use + or a dict specifying the (relative) weight of each + topic_number: + number of the model topics + background_topic_number : + number of background topics + num_iter : + specifying number of iterations for each cube + + Returns + ------- + string specifying recipe for multimodal search + ''' + if modality_list is None: modality_list = list(Dataset(dataset_path).get_possible_modalities()) @@ -352,8 +403,13 @@

    Parameters

    background_topics = [f'bcg_{i}' for i in range( len(specific_topics), len(specific_topics) + background_topic_number)] + if main_modality is None: + if isinstance(modality_list, list): + main_modality = modality_list[0] + else: + raise TypeError("main_modality should be specified") self._make_multimodal_recipe( - modality=modality_list[0], + modality=main_modality, dataset_path=dataset_path, specific_topics=specific_topics, background_topics=background_topics, @@ -417,14 +473,12 @@

    Parameters

    num_iter=iterations)) cube_templates.append(smooth_phi_cube_template.format(modality=modality, num_iter=iterations)) - cube_templates.append(sparse_theta_cube_template.format(modality=modality, - num_iter=iterations)) + cube_templates.append(sparse_theta_cube_template.format(num_iter=iterations)) else: raise ValueError('That option is not availiable') if self._order == 'extended_modalities': iterations = num_iter[-1] - cube_templates.append(sparse_theta_cube_template.format(modality=modality_list[0], - num_iter=iterations)) + cube_templates.append(sparse_theta_cube_template.format(num_iter=iterations)) return ''.join(cube_templates) def _make_multimodal_recipe( @@ -433,41 +487,31 @@

    Parameters

    modality: str, specific_topics: List[str], background_topics: List[str], - modality_list: List[str] = None, + modality_list: List[str] or Dict = None, background_topic_number: int = 1, num_iter: Union[int, List[int]] = 20, ): - ''' - Creates a recipe for multimodal search - using basic template at the top of this file - - Parameters - ---------- - dataset_path : path to the data - modality : str - chosen to be main modality from modality list - modality_list : list of modality names to use - specific_topics : list of str - names of the model topics - background_topics : list of background topic names - num_iter : number or list of numbers - specifying number of iterations for each cube - - Returns - ------- - string specifying recipe for multimodal search - ''' - reg_forms = self._form_regularizers(modality_list) cube_forms = self._form_and_order_cubes( modality_list, num_iter=num_iter,) + if isinstance(modality_list, list): + modalities_description = f"modalities_to_use: {modality_list}" + elif isinstance(modality_list, dict): + # this line has correct whitespace count + header_string = "modalities_weights:" + # these ones should be indented one level more, so 8 spaces + data_strings = [f"'{k}': {v}" for k, v in modality_list.items()] + strings = [header_string] + data_strings + modalities_description = "\n ".join(strings) + else: + raise TypeError("modality_list should be either list or dict, not {type(modality_list}") self._recipe = self.recipe_template.format( modality=modality, dataset_path=dataset_path, specific_topics=specific_topics, background_topics=background_topics, - modality_list=modality_list, + modalities_description=modalities_description, syntesized_regularizers=reg_forms, syntesized_stages=cube_forms)
    @@ -475,12 +519,104 @@

    Ancestors

    +

    Methods

    +
    +
    +def format_recipe(self, dataset_path: str, modality_list: List[str] = None, main_modality: str = None, topic_number: int = 20, background_topic_number: int = 1, num_iter: Union[int, List[int]] = 20) +
    +
    +

    Creates a recipe for multimodal search +using basic template at the top of this file

    +

    Parameters

    +
    +
    dataset_path : path to the data
    +
     
    +
    main_modality : str
    +
    chosen to be main modality from modality list, if possible +if it is not specified, the function attempts to use +the first entry of modality_list instead
    +
    modality_list : list of modality names to use
    +
    or a dict specifying the (relative) weight of each
    +
    +

    topic_number: +number of the model topics +background_topic_number : +number of background topics +num_iter : +specifying number of iterations for each cube

    +

    Returns

    +
    +
    string specifying recipe for multimodal search
    +
     
    +
    +
    + +Expand source code + +
    def format_recipe(
    +    self,
    +    dataset_path: str,
    +    modality_list: List[str] or Dict = None,
    +    main_modality: str = None,
    +    topic_number: int = 20,
    +    background_topic_number: int = 1,
    +    num_iter: Union[int, List[int]] = 20,
    +):
    +    '''
    +    Creates a recipe for multimodal search
    +    using basic template at the top of this file
    +
    +    Parameters
    +    ----------
    +    dataset_path : path to the data
    +    main_modality : str
    +        chosen to be main modality from modality list, if possible
    +        if it is not specified, the function attempts to use
    +        the first entry of `modality_list` instead
    +
    +    modality_list : list of modality names to use
    +                    or a dict specifying the (relative) weight of each
    +    topic_number:
    +        number of the model topics
    +    background_topic_number :
    +        number of background topics
    +    num_iter :
    +        specifying number of iterations for each cube
    +
    +    Returns
    +    -------
    +    string specifying recipe for multimodal search
    +    '''
    +
    +    if modality_list is None:
    +        modality_list = list(Dataset(dataset_path).get_possible_modalities())
    +
    +    specific_topics = [f'topic_{i}' for i in range(topic_number)]
    +    background_topics = [f'bcg_{i}' for i in range(
    +        len(specific_topics), len(specific_topics) + background_topic_number)]
    +
    +    if main_modality is None:
    +        if isinstance(modality_list, list):
    +            main_modality = modality_list[0]
    +        else:
    +            raise TypeError("main_modality should be specified")
    +    self._make_multimodal_recipe(
    +        modality=main_modality,
    +        dataset_path=dataset_path,
    +        specific_topics=specific_topics,
    +        background_topics=background_topics,
    +        modality_list=modality_list,
    +        num_iter=num_iter,
    +    )
    +    return self._recipe
    +
    +
    +

    Inherited members

    @@ -503,6 +639,9 @@

    Index

    @@ -510,7 +649,7 @@

    -

    Generated by pdoc 0.6.3.

    +

    Generated by pdoc 0.8.1.

    diff --git a/docs/cooking_machine/recipes/recipe_wrapper.html b/docs/cooking_machine/recipes/recipe_wrapper.html index 34509cd..b4173ef 100644 --- a/docs/cooking_machine/recipes/recipe_wrapper.html +++ b/docs/cooking_machine/recipes/recipe_wrapper.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.recipes.recipe_wrapper API documentation - - + + @@ -21,12 +21,21 @@

    Module topicnet.cooking_machine.recipes.recipe_wrapper
    -Source code -
    from typing import Tuple
    +
    +Expand source code
    +
    +
    from typing import (
    +    Dict,
    +    Tuple,
    +    Union,
    +)
     
     from .. import Dataset
     from .. import Experiment
    -from ..config_parser import build_experiment_environment_from_yaml_config
    +from ..config_parser import (
    +    build_experiment_environment_from_yaml_config,
    +    KEY_DICTIONARY_FILTER_PARAMETERS,
    +)
     
     
     recipe_template_example = """
    @@ -54,19 +63,18 @@ 

    Module topicnet.cooking_machine.recipes.recipe_wrapperModule topicnet.cooking_machine.recipes.recipe_wrapperModule topicnet.cooking_machine.recipes.recipe_wrapper

    @@ -108,9 +140,11 @@

    Classes

    (recipe_template)
    -

    Base class to work with recipes

    +

    Base class to work with recipes

    -Source code + +Expand source code +
    class BaseRecipe:
         """
         Base class to work with recipes
    @@ -127,19 +161,18 @@ 

    Classes

    def format_recipe(self, *args, **kwargs) -> str: """ - Updates self._recipe variable - with variables specific for the dataset + Updates `self._recipe` + with variables specific for the dataset. """ raise NotImplementedError( 'Method needs to be specified for the recipe template' ) - return self._recipe def build_experiment_environment( self, save_path: str, experiment_id: str = 'default_experiment_name', - force_separate_thread: bool = False + force_separate_thread: bool = False, ) -> Tuple[Experiment, Dataset]: """ Returns experiment and dataset instances @@ -148,11 +181,13 @@

    Classes

    Parameters ---------- - save_path: path to the folder to save experiment logs and models - experiment_id: name of the experiment folder - force_separate_thread: train each model in dedicated process + save_path + path to the folder to save experiment logs and models + experiment_id + name of the experiment folder + force_separate_thread + train each model in dedicated process; this feature helps to handle resources in Jupyter notebooks - """ if self._recipe is None: raise ValueError( @@ -163,42 +198,67 @@

    Classes

    self._recipe, save_path=save_path, experiment_id=experiment_id, - force_separate_thread=force_separate_thread + force_separate_thread=force_separate_thread, + ) + + @staticmethod + def _format_dictionary_filter_parameters( + parameters: Dict[Union[int, float, str, bool], Union[int, float, str, bool]], + indent: str) -> str: + + blank_dictionary = '{}' + + if len(parameters) == 0: + parameters_block = blank_dictionary + else: + parameters_block = '\n'.join([ + f'{indent}{k}: {v}' + for k, v in parameters.items() + ]) + + return ( + KEY_DICTIONARY_FILTER_PARAMETERS + + ':' + + ('\n' if parameters_block != blank_dictionary else ' ') + + parameters_block )

    Subclasses

    Methods

    -def build_experiment_environment(self, save_path, experiment_id='default_experiment_name', force_separate_thread=False) +def build_experiment_environment(self, save_path: str, experiment_id: str = 'default_experiment_name', force_separate_thread: bool = False) -> Tuple[Experiment, Dataset]
    -

    Returns experiment and dataset instances +

    Returns experiment and dataset instances needed to perform the hyperparameter tuning on the data according to recipe

    Parameters

    -
    save_path : path to the folder to save experiment logs and models
    -
     
    -
    experiment_id : name of the experiment folder
    -
     
    -
    force_separate_thread : train each model in dedicated process
    -
    this feature helps to handle resources in Jupyter notebooks
    -
    +
    save_path
    +
    path to the folder to save experiment logs and models
    +
    experiment_id
    +
    name of the experiment folder
    +
    force_separate_thread
    +
    train each model in dedicated process; +this feature helps to handle resources in Jupyter notebooks
    +
    -Source code + +Expand source code +
    def build_experiment_environment(
             self,
             save_path: str,
             experiment_id: str = 'default_experiment_name',
    -        force_separate_thread: bool = False
    +        force_separate_thread: bool = False,
     ) -> Tuple[Experiment, Dataset]:
         """
         Returns experiment and dataset instances
    @@ -207,11 +267,13 @@ 

    Parameters

    Parameters ---------- - save_path: path to the folder to save experiment logs and models - experiment_id: name of the experiment folder - force_separate_thread: train each model in dedicated process + save_path + path to the folder to save experiment logs and models + experiment_id + name of the experiment folder + force_separate_thread + train each model in dedicated process; this feature helps to handle resources in Jupyter notebooks - """ if self._recipe is None: raise ValueError( @@ -222,27 +284,28 @@

    Parameters

    self._recipe, save_path=save_path, experiment_id=experiment_id, - force_separate_thread=force_separate_thread + force_separate_thread=force_separate_thread, )
    -def format_recipe(self, *args, **kwargs) +def format_recipe(self, *args, **kwargs) -> str
    -

    Updates self._recipe variable -with variables specific for the dataset

    +

    Updates self._recipe +with variables specific for the dataset.

    -Source code + +Expand source code +
    def format_recipe(self, *args, **kwargs) -> str:
         """
    -    Updates self._recipe variable
    -    with variables specific for the dataset
    +    Updates `self._recipe`
    +    with variables specific for the dataset.
         """
         raise NotImplementedError(
             'Method needs to be specified for the recipe template'
    -    )
    -    return self._recipe
    + )

    @@ -276,7 +339,7 @@

    diff --git a/docs/cooking_machine/recipes/wntm.html b/docs/cooking_machine/recipes/wntm.html index 337579e..05bb7cb 100644 --- a/docs/cooking_machine/recipes/wntm.html +++ b/docs/cooking_machine/recipes/wntm.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.recipes.wntm API documentation - - + + @@ -21,7 +21,9 @@

    Module topicnet.cooking_machine.recipes.wntm

    -Source code + +Expand source code +
    from typing import List, Tuple
     
     from .recipe_wrapper import BaseRecipe
    @@ -167,10 +169,12 @@ 

    Classes

    class WNTMRecipe
    -

    Class for baseline recipe creation and -unification of recipe interface

    +

    Class for baseline recipe creation and +unification of recipe interface

    -Source code + +Expand source code +
    class WNTMRecipe(BaseRecipe):
         """
         Class for baseline recipe creation and
    @@ -247,11 +251,68 @@ 

    Ancestors

    +

    Methods

    +
    +
    +def build_experiment_environment(self, save_path: str, experiment_id: str = 'default_experiment_name', force_separate_thread: bool = False) -> Tuple[Experiment, Dataset] +
    +
    +

    Returns experiment and dataset instances +needed to perform the hyperparameter tuning on the data +according to recipe

    +

    Parameters

    +
    +
    save_path : path to the folder to save experiment logs and models
    +
     
    +
    experiment_id : name of the experiment folder
    +
     
    +
    force_separate_thread : train each model in dedicated process
    +
    this feature helps to handle resources in Jupyter notebooks
    +
    +
    + +Expand source code + +
    def build_experiment_environment(
    +        self,
    +        save_path: str,
    +        experiment_id: str = 'default_experiment_name',
    +        force_separate_thread: bool = False
    +) -> Tuple[Experiment, Dataset]:
    +    """
    +    Returns experiment and dataset instances
    +    needed to perform the hyperparameter tuning on the data
    +    according to recipe
    +
    +    Parameters
    +    ----------
    +    save_path: path to the folder to save experiment logs and models
    +    experiment_id: name of the experiment folder
    +    force_separate_thread: train each model in dedicated process
    +        this feature helps to handle resources in Jupyter notebooks
    +
    +    """
    +    if self._recipe is None:
    +        raise ValueError(
    +            'Recipe missing data specific parameters. '
    +            'Provide them with "format_recipe" method!')
    +
    +    settings, regs, model, dataset = parse(
    +        self._recipe,
    +        force_separate_thread=force_separate_thread,
    +        dataset_class=DatasetCooc
    +    )
    +    # TODO: handle dynamic addition of regularizers
    +    experiment = Experiment(experiment_id=experiment_id, save_path=save_path, topic_model=model)
    +    experiment.build(settings)
    +    return experiment, dataset
    +
    +
    +

    Inherited members

    • BaseRecipe:
    • @@ -275,6 +336,9 @@

      Index

      @@ -282,7 +346,7 @@

      -

      Generated by pdoc 0.6.3.

      +

      Generated by pdoc 0.8.1.

      diff --git a/docs/cooking_machine/rel_toolbox_lite.html b/docs/cooking_machine/rel_toolbox_lite.html index 415d3be..2a9d352 100644 --- a/docs/cooking_machine/rel_toolbox_lite.html +++ b/docs/cooking_machine/rel_toolbox_lite.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.rel_toolbox_lite API documentation - - + + @@ -21,7 +21,9 @@

      Module topicnet.cooking_machine.rel_toolbox_lite<
      -Source code + +Expand source code +
      import os
       
       
      @@ -240,9 +242,11 @@ 

      Functions

      def calc_docs_avg_len(ds, weights)
      -
      +
      -Source code + +Expand source code +
      def calc_docs_avg_len(ds, weights):
           (modality_count, modality_vocab_size, n_docs) = ds
           docs_total_len = 0
      @@ -257,9 +261,11 @@ 

      Functions

      def compute_regularizer_gimel(tokens_data, reg, modality_weights, n_topics)
      -
      +
      -Source code + +Expand source code +
      def compute_regularizer_gimel(tokens_data, reg, modality_weights, n_topics):
       
           (modality_count, modality_vocab_size, num_docs) = tokens_data
      @@ -287,9 +293,11 @@ 

      Functions

      def compute_regularizer_tau(tokens_data, reg, modality_weights, n_topics)
      -
      +
      -Source code + +Expand source code +
      def compute_regularizer_tau(tokens_data, reg, modality_weights, n_topics):
       
           (modality_count, modality_vocab_size, num_docs) = tokens_data
      @@ -319,9 +327,11 @@ 

      Functions

      def count_vocab_size(dictionary, modalities)
      -
      +
      -Source code + +Expand source code +
      def count_vocab_size(dictionary, modalities):
           # TODO: check tokens filtered by dict.filter()
           fname = 'tmp.txt'
      @@ -347,7 +357,7 @@ 

      Functions

      def handle_regularizer(use_relative_coefficients, model, regularizer, data_stats)
      -

      Handles the case of various regularizers that +

      Handles the case of various regularizers that contain 'Regularizer' in their name, namely all artm regularizers

      Parameters

      @@ -355,7 +365,7 @@

      Parameters

      indicates whether regularizer should be altered
      model : TopicModel or artm.ARTM
      to be changed in place
      -
      regularizer : an instance of Regularizer from artm library
      +
      regularizer : an instance of Regularizer from artm library
       
      data_stats : dict
      collection-specific data
      @@ -364,9 +374,11 @@

      Returns

      None
       
      -
      +
      -Source code + +Expand source code +
      def handle_regularizer(use_relative_coefficients, model, regularizer, data_stats):
           """
           Handles the case of various regularizers that
      @@ -416,9 +428,11 @@ 

      Returns

      def modality_weight_rel2abs(tokens_data, weights, default_modality)
      -
      +
      -Source code + +Expand source code +
      def modality_weight_rel2abs(tokens_data, weights, default_modality):
           (modality_count, modality_vocab_size, num_docs) = tokens_data
           taus = {}
      @@ -437,9 +451,11 @@ 

      Returns

      def phi_weight_abs2rel(ds, modality_weights, n_topics, tau, modalities_list=None)
      -
      +
      -Source code + +Expand source code +
      def phi_weight_abs2rel(ds, modality_weights, n_topics, tau, modalities_list=None):
           (modality_count, modality_vocab_size, n_docs) = ds
           if modalities_list is None:
      @@ -461,9 +477,11 @@ 

      Returns

      def phi_weight_rel2abs(ds, modality_weights, n_topics, gimel, modalities_list=None)
      -
      +
      -Source code + +Expand source code +
      def phi_weight_rel2abs(ds, modality_weights, n_topics, gimel, modalities_list=None):
           (modality_count, modality_vocab_size, n_docs) = ds
           if modalities_list is None:
      @@ -484,9 +502,11 @@ 

      Returns

      def theta_weight_abs2rel(ds, modality_weights, n_topics, tau)
      -
      +
      -Source code + +Expand source code +
      def theta_weight_abs2rel(ds, modality_weights, n_topics, tau):
           avg_doc_len = calc_docs_avg_len(ds, modality_weights)
           gimel_multiplier = avg_doc_len / n_topics + tau
      @@ -498,9 +518,11 @@ 

      Returns

      def theta_weight_rel2abs(ds, modality_weights, n_topics, gimel)
      -
      +
      -Source code + +Expand source code +
      def theta_weight_rel2abs(ds, modality_weights, n_topics, gimel):
           avg_doc_len = calc_docs_avg_len(ds, modality_weights)
           tau = (avg_doc_len / n_topics) * gimel / (1 - gimel)
      @@ -511,9 +533,11 @@ 

      Returns

      def transform_regularizer(tokens_data, reg, modality_weights, n_topics=None)
      -
      +
      -Source code + +Expand source code +
      def transform_regularizer(tokens_data, reg, modality_weights, n_topics=None):
       
           if n_topics is None and len(reg.topic_names) == 0:
      @@ -569,7 +593,7 @@ 

      Index

      diff --git a/docs/cooking_machine/routine.html b/docs/cooking_machine/routine.html index de9150d..a624a2c 100644 --- a/docs/cooking_machine/routine.html +++ b/docs/cooking_machine/routine.html @@ -3,14 +3,14 @@ - + topicnet.cooking_machine.routine API documentation - - + + @@ -21,15 +21,20 @@

      Module topicnet.cooking_machine.routine

      -Source code -
      import numpy as np
      +
      +Expand source code
      +
      +
      import glob
       import hashlib
       import json
      +import numexpr as ne
      +import numpy as np
      +import os
       import re
       import warnings
      +
       from datetime import datetime
       from statistics import mean, median
      -import numexpr as ne
       
       
       W_TOO_STRICT = 'No models match criteria '
      @@ -465,7 +470,7 @@ 

      Module topicnet.cooking_machine.routine

      models_num = models_num_from_query if models_num is not None and int(models_num) < 0: - raise ValueError(f"Cannot return negative number of models") + raise ValueError("Cannot return negative number of models") return models_num @@ -688,7 +693,30 @@

      Module topicnet.cooking_machine.routine

      if not data: break m.update(data) - return m.hexdigest()
      + return m.hexdigest() + + +def load_models_from_disk(experiment_directory, base_experiment_name): + """ + Is useful for restoring failed experiment + """ + from topicnet.cooking_machine.experiment import START + from topicnet.cooking_machine.models import DummyTopicModel + + result_models = [] + + mask = f"{experiment_directory}/{base_experiment_name}_*" + msg = (f'Trying to load models from {mask}.' + f' {len(glob.glob(mask))} models found.') + print(msg) + for folder in glob.glob(mask): + model_pathes = [ + f.path for f in os.scandir(folder) + if f.is_dir() and f.name != START + ] + result_models += [DummyTopicModel.load(path) for path in model_pathes] + + return result_models
      @@ -702,14 +730,16 @@

      Functions

      def blake2bchecksum(file_path)
      -

      Calculates hash of the file

      +

      Calculates hash of the file

      Parameters

      file_path : str
      path to the file
      -
      +
      -Source code + +Expand source code +
      def blake2bchecksum(file_path):
           """
           Calculates hash of the file
      @@ -730,10 +760,10 @@ 

      Parameters

      -def choose_best_models(models, requirement_lesser, requirement_greater, requirement_equal, metric, extremum='min', models_num=None) +def choose_best_models(models: list, requirement_lesser: list, requirement_greater: list, requirement_equal: list, metric: str, extremum='min', models_num=None)
      -

      Get best model according to specified metric.

      +

      Get best model according to specified metric.

      Parameters

      models : list of TopicModel
      @@ -759,9 +789,11 @@

      Returns

      best_models : list of models
      models with best scores or matching request
      -
      +
      -Source code + +Expand source code +
      def choose_best_models(models: list, requirement_lesser: list, requirement_greater: list,
                              requirement_equal: list, metric: str, extremum="min", models_num=None):
           """
      @@ -821,12 +853,14 @@ 

      Returns

      -def choose_value_for_models_num_and_check(models_num_as_parameter, models_num_from_query) +def choose_value_for_models_num_and_check(models_num_as_parameter, models_num_from_query) -> int
      -
      +
      -Source code + +Expand source code +
      def choose_value_for_models_num_and_check(
               models_num_as_parameter, models_num_from_query) -> int:
       
      @@ -851,7 +885,7 @@ 

      Returns

      models_num = models_num_from_query if models_num is not None and int(models_num) < 0: - raise ValueError(f"Cannot return negative number of models") + raise ValueError("Cannot return negative number of models") return models_num
      @@ -860,9 +894,11 @@

      Returns

      def compute_special_queries(special_models, special_queries)
      -

      Computes special queries with functions.

      +

      Computes special queries with functions.

      -Source code + +Expand source code +
      def compute_special_queries(special_models, special_queries):
           """
           Computes special queries with functions.
      @@ -908,7 +944,7 @@ 

      Returns

      def extract_required_parameter(model, parameter)
      -

      Extracts necessary parameter from model.

      +

      Extracts necessary parameter from model.

      Parameters

      model : TopicModel
      @@ -918,11 +954,13 @@

      Parameters

      Returns

      -
      optional
      +
      optional
       
      -
      +
      -Source code + +Expand source code +
      def extract_required_parameter(model, parameter):
           """
           Extracts necessary parameter from model.
      @@ -975,10 +1013,10 @@ 

      Returns

      -def get_equal_lists(one_dict, min_len=0, sep=' ', sep_len='last') +def get_equal_lists(one_dict, min_len: int = 0, sep: str = ' ', sep_len='last')
      -

      Transforms all lists to list with the same length, but not less that min_len. +

      Transforms all lists to list with the same length, but not less than min_len. Fills lists with sep. Inplace.

      Parameters

      @@ -991,9 +1029,11 @@

      Parameters

      sep_len : int or "last"
      length of added strings, if "last" than length of added strings is equal to the length of the last string in the list (Default value = "last")
      -
      +
      -Source code + +Expand source code +
      def get_equal_lists(one_dict, min_len: int = 0, sep: str = " ", sep_len="last"):
           """
           Transforms all lists to list with the same length, but not less that min_len.
      @@ -1024,10 +1064,10 @@ 

      Parameters

      -def get_equal_strings(strings, min_len=0, sep=' ') +def get_equal_strings(strings, min_len: int = 0, sep: str = ' ')
      -

      Transforms all strings to strings with the same length, but not less that min_len. +

      Transforms all strings to strings with the same length, but not less than min_len. Fills strings with sep. Inplace.

      Parameters

      @@ -1037,9 +1077,11 @@

      Parameters

      minimal length of the string (Default value = 0)
      sep : str
      filling symbol (Default value = " ")
      -
      +
      -Source code + +Expand source code +
      def get_equal_strings(strings, min_len: int = 0, sep: str = " "):
           """
           Transforms all strings to strings with the same length, but not less that min_len.
      @@ -1063,10 +1105,10 @@ 

      Parameters

      -def get_fix_list(input_list, length, num) +def get_fix_list(input_list: list, length: int, num: int)
      -

      Returns list with strings of size length that contains not more than num strings.

      +

      Returns list with strings of size length that contains not more than num strings.

      Parameters

      input_list : list
      @@ -1080,9 +1122,11 @@

      Returns

      list
      list with no more than num of beautiful strings
      -
      +
      -Source code + +Expand source code +
      def get_fix_list(input_list: list, length: int, num: int):
           """
           Returns list with strings of size length that contains not more than num strings.
      @@ -1130,10 +1174,10 @@ 

      Returns

      -def get_fix_string(input_string, length) +def get_fix_string(input_string: str, length: int)
      -

      Transforms input_string to the string of the size length.

      +

      Transforms input_string to the string of the size length.

      Parameters

      input_string : str
      @@ -1145,9 +1189,11 @@

      Returns

      str
      beautiful string of the size length
      -
      +
      -Source code + +Expand source code +
      def get_fix_string(input_string: str, length: int):
           """
           Transforms input_string to the string of the size length.
      @@ -1184,19 +1230,21 @@ 

      Returns

      def get_public_instance_attributes(instance)
      -

      Get list of all instance public atrributes.

      +

      Get list of all instance public attributes.

      Parameters

      -
      instance : optional
      +
      instance : optional
       

      Returns

      list of str
       
      -
      +
      -Source code + +Expand source code +
      def get_public_instance_attributes(instance):
           """
           Get list of all instance public atrributes.
      @@ -1221,14 +1269,16 @@ 

      Returns

      def get_timestamp_in_str_format()
      -

      Returns current timestamp.

      +

      Returns current timestamp.

      Returns

      str
      timestamp in "%Hh%Mm%Ss_%dd%mm%Yy" format
      -
      +
      -Source code + +Expand source code +
      def get_timestamp_in_str_format():
           """
           Returns current timestamp.
      @@ -1248,7 +1298,7 @@ 

      Returns

      def is_acceptable(model, requirement_lesser, requirement_greater, requirement_equal)
      -

      Checks if model suits request.

      +

      Checks if model suits request.

      Parameters

      model : TopicModel
      @@ -1264,9 +1314,11 @@

      Returns

      bool
       
      -
      +
      -Source code + +Expand source code +
      def is_acceptable(model, requirement_lesser, requirement_greater, requirement_equal):
           """
           Checks if model suits request.
      @@ -1307,19 +1359,21 @@ 

      Returns

      def is_jsonable(x)
      -

      Check that x is jsonable

      +

      Check that x is jsonable

      Parameters

      -
      x : optional
      +
      x : optional
       

      Returns

      bool
       
      -
      +
      -Source code + +Expand source code +
      def is_jsonable(x):
           """
           Check that x is jsonable
      @@ -1344,9 +1398,11 @@ 

      Returns

      def is_saveable_model(model=None, model_id=None, experiment=None)
      -

      Little helpful function. May be extended later.

      +

      Little helpful function. May be extended later.

      -Source code + +Expand source code +
      def is_saveable_model(model=None, model_id=None, experiment=None):
           """
           Little helpful function. May be extended later.
      @@ -1368,11 +1424,43 @@ 

      Returns

      return isinstance(model, SUPPORTED_MODEL_CLASSES)
      +
      +def load_models_from_disk(experiment_directory, base_experiment_name) +
      +
      +

      Is useful for restoring failed experiment

      +
      + +Expand source code + +
      def load_models_from_disk(experiment_directory, base_experiment_name):
      +    """
      +    Is useful for restoring failed experiment
      +    """
      +    from topicnet.cooking_machine.experiment import START
      +    from topicnet.cooking_machine.models import DummyTopicModel
      +
      +    result_models = []
      +
      +    mask = f"{experiment_directory}/{base_experiment_name}_*"
      +    msg = (f'Trying to load models from {mask}.'
      +           f' {len(glob.glob(mask))} models found.')
      +    print(msg)
      +    for folder in glob.glob(mask):
      +        model_pathes = [
      +            f.path for f in os.scandir(folder)
      +            if f.is_dir() and f.name != START
      +        ]
      +        result_models += [DummyTopicModel.load(path) for path in model_pathes]
      +
      +    return result_models
      +
      +
      -def parse_query_string(query_string) +def parse_query_string(query_string: str)
      -

      This function will parse query string and subdivide it into following parts:

      +

      This function will parse query string and subdivide it into following parts:

      Parameters

      query_string : str
      @@ -1390,9 +1478,11 @@

      Returns

       
      extremum : str
       
      -
      +
      -Source code + +Expand source code +
      def parse_query_string(query_string: str):
           """
           This function will parse query string and subdivide it into following parts:
      @@ -1460,18 +1550,20 @@ 

      Returns

      def transform_complex_entity_to_dict(some_entity)
      -

      Parameters

      +

      Parameters

      -
      some_entity : optional
      +
      some_entity : optional
       

      Returns

      dict
      jsonable entity
      -
      +
      -Source code + +Expand source code +
      def transform_complex_entity_to_dict(some_entity):
           """
       
      @@ -1505,7 +1597,7 @@ 

      Returns

      def transform_topic_model_description_to_jsonable(obj)
      -

      Change object to handle serialization problems with json.

      +

      Change object to handle serialization problems with json.

      Parameters

      obj : object
      @@ -1515,9 +1607,11 @@

      Returns

      int
      jsonable object
      -
      +
      -Source code + +Expand source code +
      def transform_topic_model_description_to_jsonable(obj):
           """
           Change object to handle serialization problems with json.
      @@ -1588,6 +1682,7 @@ 

      Index

    • is_acceptable
    • is_jsonable
    • is_saveable_model
    • +
    • load_models_from_disk
    • parse_query_string
    • transform_complex_entity_to_dict
    • transform_topic_model_description_to_jsonable
    • @@ -1597,7 +1692,7 @@

      Index

      diff --git a/docs/dataset_manager/api.html b/docs/dataset_manager/api.html index f3636a2..764a318 100644 --- a/docs/dataset_manager/api.html +++ b/docs/dataset_manager/api.html @@ -3,14 +3,14 @@ - + topicnet.dataset_manager.api API documentation - - + + @@ -21,7 +21,9 @@

      Module topicnet.dataset_manager.api

      -Source code + +Expand source code +
      import gzip
       import os
       import pandas as pd
      @@ -41,7 +43,7 @@ 

      Module topicnet.dataset_manager.api

      from ..cooking_machine.dataset import Dataset -_SERVER_URL = 'https://93.175.29.159:8085' +_SERVER_URL = 'https://topicnet-datasets.machine-intelligence.ru' _ARCHIVE_EXTENSION = '.gz' _DEFAULT_DATASET_FILE_EXTENSION = '.csv' @@ -92,12 +94,14 @@

      Module topicnet.dataset_manager.api

      dataset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), dataset_name) try: + print(f'Checking if dataset "{dataset_name}" was already downloaded before') + saved_dataset = _init_dataset_if_downloaded(dataset_path, **kwargs) except FileNotFoundError: - pass + print(f'Dataset "{dataset_name}" not found on the machine') else: print( - f'Dataset already downloaded!' + f'Dataset is found on the machine.' f' Save path is: "{saved_dataset._data_path}"' ) @@ -111,6 +115,8 @@

      Module topicnet.dataset_manager.api

      print(f'Downloading the "{dataset_name}" dataset...') + save_path = None + try: with urlopen(req, data=data, context=context) as answer: total_size = int(answer.headers.get('content-length', 0)) @@ -133,7 +139,7 @@

      Module topicnet.dataset_manager.api

      if total_size != 0 and t.n != total_size: raise RuntimeError( - "Failed to download dataset!" + "Failed to download the dataset!" " Some data was lost during network transfer" ) @@ -146,19 +152,19 @@

      Module topicnet.dataset_manager.api

      return Dataset(save_path, **kwargs) except Exception as exception: - if os.path.isfile(save_path): + if save_path is not None and os.path.isfile(save_path): os.remove(save_path) raise exception finally: - if os.path.isfile(save_path + _ARCHIVE_EXTENSION): + if save_path is not None and os.path.isfile(save_path + _ARCHIVE_EXTENSION): os.remove(save_path + _ARCHIVE_EXTENSION) def _init_dataset_if_downloaded(dataset_path: str, **kwargs) -> Dataset: saved_dataset_path_candidates = [ - p for p in glob(dataset_path + '*') + p for p in glob(dataset_path + '.*') if os.path.isfile(p) and not p.endswith(_ARCHIVE_EXTENSION) ] dataset = None @@ -185,25 +191,27 @@

      Module topicnet.dataset_manager.api

      Functions

      -def get_info() +def get_info() -> str
      -

      Gets info about all datasets.

      +

      Gets info about all datasets.

      Returns

      -
      str with MarkDown syntax
      +
      str with MarkDown syntax
       

      Examples

      As the return value is MarkDown text, in Jupyter Notebook one may do the following to format the output information nicely

      -
      >>> from IPython.display import Markdown
      +
      >>> from IPython.display import Markdown
       ...
       >>> Markdown(get_info())
      -
      +
      -Source code + +Expand source code +
      def get_info() -> str:
           """
           Gets info about all datasets.
      @@ -231,24 +239,24 @@ 

      Examples

      -def load_dataset(dataset_name, **kwargs) +def load_dataset(dataset_name: str, **kwargs) -> Dataset
      -

      Load dataset by dataset_name. -Run <a title="topicnet.dataset_manager.api.get_info" href="#topicnet.dataset_manager.api.get_info">get_info()</a> to get dataset information

      +

      Load dataset by dataset_name. +Run get_info() to get dataset information

      Parameters

      dataset_name : str
      dataset name for download

      Another Parameters

      -
      -
      kwargs
      -
      optional properties of -:class:~topicnet.cooking_machine.Dataset
      -
      +

      kwargs +optional properties of +:class:~topicnet.cooking_machine.Dataset

      -Source code + +Expand source code +
      def load_dataset(dataset_name: str, **kwargs) -> Dataset:
           """
           Load dataset by dataset_name.
      @@ -269,12 +277,14 @@ 

      Another Parameters

      dataset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), dataset_name) try: + print(f'Checking if dataset "{dataset_name}" was already downloaded before') + saved_dataset = _init_dataset_if_downloaded(dataset_path, **kwargs) except FileNotFoundError: - pass + print(f'Dataset "{dataset_name}" not found on the machine') else: print( - f'Dataset already downloaded!' + f'Dataset is found on the machine.' f' Save path is: "{saved_dataset._data_path}"' ) @@ -288,6 +298,8 @@

      Another Parameters

      print(f'Downloading the "{dataset_name}" dataset...') + save_path = None + try: with urlopen(req, data=data, context=context) as answer: total_size = int(answer.headers.get('content-length', 0)) @@ -310,7 +322,7 @@

      Another Parameters

      if total_size != 0 and t.n != total_size: raise RuntimeError( - "Failed to download dataset!" + "Failed to download the dataset!" " Some data was lost during network transfer" ) @@ -323,13 +335,13 @@

      Another Parameters

      return Dataset(save_path, **kwargs) except Exception as exception: - if os.path.isfile(save_path): + if save_path is not None and os.path.isfile(save_path): os.remove(save_path) raise exception finally: - if os.path.isfile(save_path + _ARCHIVE_EXTENSION): + if save_path is not None and os.path.isfile(save_path + _ARCHIVE_EXTENSION): os.remove(save_path + _ARCHIVE_EXTENSION)
      @@ -359,7 +371,7 @@

      Index

      diff --git a/docs/dataset_manager/index.html b/docs/dataset_manager/index.html index d07624e..3b81a4b 100644 --- a/docs/dataset_manager/index.html +++ b/docs/dataset_manager/index.html @@ -3,14 +3,14 @@ - + topicnet.dataset_manager API documentation - - + + @@ -21,7 +21,9 @@

      Module topicnet.dataset_manager

      -Source code + +Expand source code +
      from .api import (
           get_info,
           load_dataset,
      @@ -33,7 +35,7 @@ 

      Sub-modules

      topicnet.dataset_manager.api
      -
      +
      @@ -64,7 +66,7 @@

      Index

      diff --git a/docs/index.html b/docs/index.html index a1a46e2..dc0efc1 100644 --- a/docs/index.html +++ b/docs/index.html @@ -3,36 +3,27 @@ - + topicnet API documentation - - + +
      -

      Module topicnet

      +

      Package topicnet

      -

      TopicNet

      -

      The library was created to assist in the task of building topic models. It aims to automate away many routine tasks related to topic model training, allowing a user to focus on the task at hand. Also, it provides additional tools to construct advanced topic models. The library consists of the following modules:

      -
        -
      • cooking_machine — provides tools to design a topic model construction pipeline, or experiment with regularizers fitting
      • -
      • viewers — provides information about the topic model in an accessible format
      • -
      • demos — demo .ipynb notebooks
      • -
      • dataset_manager — gives opportunity to download datasets for experiments
      • -
      • tests — provides a user with means to test library functionality (contains some examples of intended library usage)
      • -
      -

      Project description

      -

      In TopicNet framework, advanced topic models are build using Experiment class. An experiment consists of stages (that we call “cubes”) which perform actions over the “models” which are objects of the Experiment. The experiment instance of Experiment class contains all the information about the experiment process and automatically updates its log when a cube is applied to the last level models. It is worth noting that the experiment is linear, meaning it does not support multiple different cubes at the same stage of the experiment. If that need arises one is recommended to create a new experiment with a new cube on the last level. The experiment instance of Experiment class contains all the information about the experiment process and automatically updates its log when the cube is applied to the last level models. Summarizing: the key entity Experiment is a sequence of cubes that produce models on each stage of the experiment process

      -Source code + +Expand source code +
      import artm
       
       # change log style
      @@ -49,15 +40,15 @@ 

      Sub-modules

      topicnet.cooking_machine
      -
      +
      topicnet.dataset_manager
      -
      +
      topicnet.viewers
      -
      +
      @@ -69,7 +60,7 @@

      Sub-modules

      diff --git a/docs/viewers/base_viewer.html b/docs/viewers/base_viewer.html index 93fa6f2..8352072 100644 --- a/docs/viewers/base_viewer.html +++ b/docs/viewers/base_viewer.html @@ -3,14 +3,14 @@ - + topicnet.viewers.base_viewer API documentation - - + + @@ -21,7 +21,9 @@

      Module topicnet.viewers.base_viewer

      -Source code + +Expand source code +
      from ..cooking_machine.models.base_model import BaseModel
       
       
      @@ -64,9 +66,11 @@ 

      Classes

      (model)
      -
      +
      -Source code + +Expand source code +
      class BaseViewer:
           """ """
           def __init__(self, model):
      @@ -94,21 +98,23 @@ 

      Classes

      Subclasses

      Instance variables

      var model
      -
      +
      -Source code + +Expand source code +
      @property
       def model(self):
           """ """
      @@ -122,14 +128,16 @@ 

      Methods

      def view(self, *args, **kwargs)
      -

      Main method of viewer.

      +

      Main method of viewer.

      Returns

      -
      optional
      +
      optional
       
      -
      +
      -Source code + +Expand source code +
      def view(self, *args, **kwargs):
           """
           Main method of viewer.
      @@ -173,7 +181,7 @@ 

      -

      Generated by pdoc 0.6.3.

      +

      Generated by pdoc 0.8.1.

      diff --git a/docs/viewers/document_cluster.html b/docs/viewers/document_cluster.html index 09e5961..f9d17dd 100644 --- a/docs/viewers/document_cluster.html +++ b/docs/viewers/document_cluster.html @@ -3,14 +3,14 @@ - + topicnet.viewers.document_cluster API documentation - - + + @@ -21,7 +21,9 @@

      Module topicnet.viewers.document_cluster

      -Source code + +Expand source code +
      import numpy as np
       import colorlover as cl
       import plotly.graph_objs as go
      @@ -181,14 +183,16 @@ 

      Classes

      (model)
      -

      This viewer performs dimesionality reduction over document embeddings

      +

      This viewer performs dimensionality reduction over document embeddings

      Parameters

      model : TopicModel
       
      -
      +
      -Source code + +Expand source code +
      class DocumentClusterViewer(BaseViewer):
           """
           This viewer performs dimesionality reduction over document embeddings
      @@ -330,10 +334,10 @@ 

      Ancestors

      Methods

      -def viev_from_jupyter(self, dataset, method='TSNE', save_path='DocumentCluster_view.html', width=800, height=600, display_output=True, give_html=False) +def viev_from_jupyter(self, dataset, method: str = 'TSNE', save_path: str = 'DocumentCluster_view.html', width: int = 800, height: int = 600, display_output: bool = True, give_html: bool = False)
      -

      Parameters

      +

      Parameters

      dataset : Dataset
       
      @@ -359,9 +363,11 @@

      Returns

      out_html : string
      an html string containing the plotly graph returned only if give_html is True
      -
      +
      -Source code + +Expand source code +
      def viev_from_jupyter(
           self,
           dataset,
      @@ -418,7 +424,7 @@ 

      Returns

      def view(self, dataset, save_path, method='TSNE', to_html=True)
      -

      Parameters

      +

      Parameters

      dataset : Dataset
       
      @@ -431,14 +437,16 @@

      Returns

      Returns

      -
      reduced_data : an np.array of (num_docs, dim) dimensions
      +
      reduced_data : an np.array of (num_docs, dim) dimensions
      reduced dumensions of the original document embeddings
      html_div : string
      an html string containing the plotly graph returned only if to_html is True
      -
      +
      -Source code + +Expand source code +
      def view(
               self,
               dataset,
      @@ -540,7 +548,7 @@ 

      diff --git a/docs/viewers/index.html b/docs/viewers/index.html index b49bbd0..1051a57 100644 --- a/docs/viewers/index.html +++ b/docs/viewers/index.html @@ -3,14 +3,14 @@ - + topicnet.viewers API documentation - - + + @@ -19,81 +19,11 @@

      Module topicnet.viewers

      -

      Viewers

      -

      Module viewers provides information from a topic model allowing to estimate the model quality. Its advantage is in unified call ifrastucture to the topic model making the routine and tedious task of extracting the information easy.

      -

      Currently module contains the following viewers:

      -

      base_viewer (BaseViewer)

      -

      Module responsible for base infrastructure.

      -

      document_cluster (DocumentClusterViewer)

      -

      Module which allows to visualize collection documents. May be slow for large document collections as it uses TSNE algorithm from sklearn library.

      -

      -

      -
          <img src="../docs/images/doc_cluster__plot.png" width="80%" alt/>
      -</div>
      -<em>
      -    Visualisation of reduced document embeddings colored according to their topic made by DocumentClusterViewer.
      -</em>
      -

      -

      spectrum (TopicSpectrumViewer)

      -

      Module contains heuristics for solving TSP to arrange topics minimizing total distance of the spectrum.

      -

      -

      -
          <img src="../docs/images/topic_spectrum__refined_view.png" width="80%" alt/>
      -</div>
      -<em>
      -    Each point on the plot represents some topic.
      -    The viewer helped to calculate such a route between topics when one topic is connected with similar one, and so on, forming a circle.
      -</em>
      -

      -

      top_documents_viewer (TopDocumentsViewer)

      -

      Module with functions that work with dataset document collections.

      -

      -

      -
          <img src="../docs/images/top_doc__view.png" width="80%" alt/>
      -</div>
      -<em>
      -    The viewer shows fragments of top documents corresponding to some topic.
      -</em>
      -

      -

      top_similar_documents_viewer (TopSimilarDocumentsViewer)

      -

      Module containing class for finding similar document for a given one. This viewer helps to estimate homogeneity of clusters given by the model.

      -

      -

      -
          <img src="../docs/images/top_sim_doc__refined_view.png" width="80%" alt/>
      -</div>
      -<em>
      -    Some document from text collection (on top), and documents nearest to it given topic model.
      -    The viewer (currently) gives only document names as output, but the picture is not very difficult to be made.
      -</em>
      -

      -

      top_tokens_viewer (TopTokensViewer)

      -

      Module with class for displaying the most relevant tokens in each topic of the model.

      -

      -

      -
          <img src="../docs/images/top_tokens__view.png" width="80%" alt/>
      -</div>
      -<em>
      -    Output of the TopTokensViewer. Token score in the topic is calculated for every token, score function can be specified at the stage of a viewer initialization.
      -</em>
      -

      -

      topic_mapping (TopicMapViewer)

      -

      Module allowing to compare topics between two different models trained on the same collection.

      -

      -

      -
          <img src="../docs/images/topic_map__view.png" width="80%" alt/>
      -</div>
      -<em>
      -    The mapping between topics of two models (currently only topic names are displayed).
      -</em>
      -

      -

      Deprecated

      -
        -
      • initial_doc_to_topic_viewer — first edition of TopDocumentsViewer

      • -
      • tokens_viewer - first edition of TopTokensViewer

      • -
      -Source code + +Expand source code +
      from .base_viewer import BaseViewer
       from .document_cluster import DocumentClusterViewer
       from .spectrum import TopicSpectrumViewer
      @@ -108,41 +38,41 @@ 

      Sub-modules

      topicnet.viewers.base_viewer
      -
      +
      topicnet.viewers.document_cluster
      -
      +
      topicnet.viewers.initial_doc_to_topic_viewer
      -
      +
      topicnet.viewers.spectrum
      -

      A few ways to obtain "decent" solution to TSP problem +

      A few ways to obtain "decent" solution to TSP problem which returns a spectre of topics in our case.
      -If speed is the essence I recommend to use …

      +If speed is the essence I recommend to use …

      topicnet.viewers.top_documents_viewer
      -
      +
      topicnet.viewers.top_similar_documents_viewer
      -
      +
      topicnet.viewers.top_tokens_viewer
      -
      +
      topicnet.viewers.topic_flow_viewer
      -
      +
      topicnet.viewers.topic_mapping
      -
      +
      @@ -154,7 +84,7 @@

      Sub-modules

      diff --git a/docs/viewers/initial_doc_to_topic_viewer.html b/docs/viewers/initial_doc_to_topic_viewer.html index 7f03d3e..e071870 100644 --- a/docs/viewers/initial_doc_to_topic_viewer.html +++ b/docs/viewers/initial_doc_to_topic_viewer.html @@ -3,14 +3,14 @@ - + topicnet.viewers.initial_doc_to_topic_viewer API documentation - - + + @@ -21,7 +21,9 @@

      Module topicnet.viewers.initial_doc_to_topic_viewer
      -Source code + +Expand source code +
      from .base_viewer import BaseViewer
       
       
      @@ -85,9 +87,11 @@ 

      Classes

      (dataset_id, model)
      -
      +
      -Source code + +Expand source code +
      class TopTopicsFeatures(BaseViewer):
           """ """
           def __init__(self, dataset_id, model):
      @@ -143,22 +147,24 @@ 

      Methods

      def view(self, document_id, topic_name=None, batch_vectorizer=None)
      -

      Parameters

      +

      Parameters

      document_id : str
      id of document
      topic_name : str
      (Default value = None)
      -
      batch_vectorizer : optional
      +
      batch_vectorizer : optional
      (Default value = None)

      Returns

      result : dict
       
      -
      +
      -Source code + +Expand source code +
      def view(self, document_id, topic_name=None, batch_vectorizer=None):
           """
       
      @@ -229,7 +235,7 @@ 

      -

      Generated by pdoc 0.6.3.

      +

      Generated by pdoc 0.8.1.

      diff --git a/docs/viewers/spectrum.html b/docs/viewers/spectrum.html index ce8d61a..ee5e06a 100644 --- a/docs/viewers/spectrum.html +++ b/docs/viewers/spectrum.html @@ -3,7 +3,7 @@ - + topicnet.viewers.spectrum API documentation - - + + @@ -32,7 +32,9 @@

      Module topicnet.viewers.spectrum

      Within a few runs with right temperature selected it can provide a solution better than the initial.

      -Source code + +Expand source code +
      """
       A few ways to obtain "decent" solution to TSP problem
       which returns a spectre of topics in our case.  
      @@ -508,19 +510,21 @@ 

      Functions

      def generate_all_segments(n)
      -

      Generates all segments combinations for 3-opt swap operation.

      +

      Generates all segments combinations for 3-opt swap operation.

      Parameters

      -
      n : int > 5
      +
      n : int > 5
      length of path for fixed endpoint

      Yields

      list of int
       
      -
      +
      -Source code + +Expand source code +
      def generate_all_segments(n):
           """
           Generates all segments combinations for 3-opt swap operation.
      @@ -545,20 +549,22 @@ 

      Yields

      def generate_index_candidates(n)
      -

      Randomly chooses 3 indexes from the path.
      +

      Randomly chooses 3 indexes from the path.
      Does not swap the first or the last point because they fixed.

      Parameters

      -
      n : int > 5
      +
      n : int > 5
      length of the path

      Returns

      segment : list of int
      sorted list of candidates for 3 opt swap optimization
      -
      +
      -Source code + +Expand source code +
      def generate_index_candidates(n):
           """
           Randomly chooses 3 indexes from the path.  
      @@ -593,7 +599,7 @@ 

      Returns

      def generate_three_opt_candidates(path, sequence)
      -

      Generates all possible tour connections and filters out a trivial one.

      +

      Generates all possible tour connections and filters out a trivial one.

      Parameters

      path : np.array of float
      @@ -605,9 +611,11 @@

      Yields

      list of int
      possible tour
      -
      +
      -Source code + +Expand source code +
      def generate_three_opt_candidates(path, sequence):
           """
           Generates all possible tour connections and filters out a trivial one.
      @@ -655,7 +663,7 @@ 

      Yields

      def get_annealed_spectrum(phi_matrix, t_coeff, start_topic=0, metric='jensenshannon', init_path=None, max_iter=1000000, early_stopping=100000)
      -

      Returns annealed spectrum for the topics in the Phi matrix +

      Returns annealed spectrum for the topics in the Phi matrix with default metrics being Jensen-Shannon.

      Parameters

      @@ -682,9 +690,11 @@

      Returns

      best path obtained during the run
      best_score : float
      length of the best path during the run
      -
      +
      -Source code + +Expand source code +
      def get_annealed_spectrum(phi_matrix,
                                 t_coeff,
                                 start_topic=0,
      @@ -766,7 +776,7 @@ 

      Returns

      def get_nearest_neighbour_init(phi_matrix, metric='jensenshannon', start_topic=0)
      -

      Given the matrix calculates the initial path by nearest neighbour heuristic.

      +

      Given the matrix calculates the initial path by nearest neighbour heuristic.

      Parameters

      phi_matrix : np.array of float
      @@ -780,9 +790,11 @@

      Returns

      init_path : list of int
      order of initial topic distribution
      -
      +
      -Source code + +Expand source code +
      def get_nearest_neighbour_init(phi_matrix, metric='jensenshannon', start_topic=0):
           """
           Given the matrix calculates the initial path by nearest neighbour heuristic.
      @@ -824,7 +836,7 @@ 

      Returns

      def get_three_opt_path(path, distance_m, max_iter=20)
      -

      Iterative improvement based on 3 opt exchange.

      +

      Iterative improvement based on 3 opt exchange.

      Parameters

      path : list of int
      @@ -839,9 +851,11 @@

      Returns

      path : list of int
      end optimization of the route
      -
      +
      -Source code + +Expand source code +
      def get_three_opt_path(path, distance_m, max_iter=20):
           """
           Iterative improvement based on 3 opt exchange.
      @@ -882,7 +896,7 @@ 

      Returns

      def make_three_opt_swap(path, distance_m, sequence, temperature=None)
      -

      Performs swap based on the selection candidates, +

      Performs swap based on the selection candidates, allows for non-optimal solution to be accepted based on Boltzman distribution.

      Parameters

      @@ -905,9 +919,11 @@

      Returns

      best path after the permutation
      val : float
      a value gained after the path permutation
      -
      +
      -Source code + +Expand source code +
      def make_three_opt_swap(path, distance_m, sequence, temperature=None):
           """
           Performs swap based on the selection candidates,
      @@ -975,7 +991,7 @@ 

      Classes

      (model, t_coeff=100000.0, start_topic=0, metric='jensenshannon', init_path=None, max_iter=1000000, early_stopping=100000, verbose=False, class_ids=None)
      -

      Class providing wrap around for functions +

      Class providing wrap around for functions that allow to view a collection of topics in order of their similarity to each other.

      Parameters

      @@ -1001,9 +1017,11 @@

      Parameters

      parameter for model.get_phi method contains list of modalities to obtain from the model (Default value = None)
      -
      +
      -Source code + +Expand source code +
      class TopicSpectrumViewer(BaseViewer):
           def __init__(
               self,
      @@ -1157,7 +1175,7 @@ 

      Methods

      def view(self, class_ids=None)
      -

      The class method returning ordered spectrum of +

      The class method returning ordered spectrum of the topics.

      Parameters

      @@ -1166,9 +1184,11 @@

      Parameters

      contains list of modalities to obtain from the model (Default value = None)
      ordered_topics : list of str
      topic names from the model ordered as spectrum
      -

      +
      -Source code + +Expand source code +
      def view(self, class_ids=None):
           """
           The class method returning ordered spectrum of
      @@ -1208,7 +1228,7 @@ 

      Parameters

      def view_from_jupyter(self, class_ids=None, display_output=True, give_html=False, **kwargs)
      -

      TopicSpectrumViewer method recommended for use +

      TopicSpectrumViewer method recommended for use from jupyter notebooks returns ordered list of topics minimizing path that connects all of them in topic space @@ -1228,12 +1248,12 @@

      Returns

      html string of the output

      Another Parameters

      -
      -
      **kwargs
      -
      kwargs are optional ~.TopTokenViewer properties
      -
      +

      kwargs +kwargs are optional ~.TopTokenViewer properties

      -Source code + +Expand source code +
      def view_from_jupyter(
               self,
               class_ids=None,
      @@ -1334,7 +1354,7 @@ 

      -

      Generated by pdoc 0.6.3.

      +

      Generated by pdoc 0.8.1.

      diff --git a/docs/viewers/top_documents_viewer.html b/docs/viewers/top_documents_viewer.html index 5b95f29..3977fb2 100644 --- a/docs/viewers/top_documents_viewer.html +++ b/docs/viewers/top_documents_viewer.html @@ -3,14 +3,14 @@ - + topicnet.viewers.top_documents_viewer API documentation - - + + @@ -21,7 +21,9 @@

      Module topicnet.viewers.top_documents_viewer

      -Source code + +Expand source code +
      import numpy as np
       
       from collections import defaultdict
      @@ -348,7 +350,7 @@ 

      Functions

      def compute_cluster_top_objects_by_distance(precomputed_distances, max_top_number=10, object_clusters=None)
      -

      Compute the most representative objects for each cluster +

      Compute the most representative objects for each cluster using the precomputed_distances.

      Parameters

      @@ -358,16 +360,18 @@

      Parameters

      max_top_number : int
      maximum number of top objects of cluster (resulting number can be less than it) (Default value = 10)
      -
      object_clusters : np,array
      +
      object_clusters : np,array
      array of shape n_objects - precomputed clusters for objects

      Returns

      -
      clusters_top_objects : list of list of indexes
      +
      clusters_top_objects : list of list of indexes
      (Default value = None)
      -
      +
      -Source code + +Expand source code +
      def compute_cluster_top_objects_by_distance(precomputed_distances,
                                                   max_top_number=10,
                                                   object_clusters=None):
      @@ -434,7 +438,7 @@ 

      Returns

      def predict_cluster_by_precomputed_distances(precomputed_distances)
      -

      Predict a cluster for each object with precomputed distances.

      +

      Predict a cluster for each object with precomputed distances.

      Parameters

      precomputed_distances : np.array
      @@ -444,9 +448,11 @@

      Returns

      np.array
      array of length X.shape[0], each element is cluster of ith object
      -
      +
      -Source code + +Expand source code +
      def predict_cluster_by_precomputed_distances(precomputed_distances):
           """
           Predict a cluster for each object with precomputed distances.
      @@ -466,20 +472,20 @@ 

      Returns

      -def prepare_html_string(document, num_sentences_in_snippet=4, num_words=15) +def prepare_html_string(document, num_sentences_in_snippet: int = 4, num_words: int = 15)
      -

      Prepares basic version of raw html +

      Prepares basic version of raw html representing the document. Takes title (document_id) and combines it with portion of the document text (first few sentences) also makes sure that every line contains same number of words

      Parameters

      -
      document : Padas.DataFrame row
      +
      document : Padas.DataFrame row
      a row that contains columns raw_text and index in string form
      -
      distance : float between 0 and 1
      +
      distance : float between 0 and 1
      measure of how close found document to the initial inquiry
      num_sentences_in_snippet
      @@ -492,9 +498,11 @@

      Returns

      doc_html : str
       
      -
      +
      -Source code + +Expand source code +
      def prepare_html_string(
           document,
           num_sentences_in_snippet: int = 4,
      @@ -542,7 +550,7 @@ 

      Returns

      def transform_cluster_objects_list_to_dict(object_clusters)
      -

      Transforms list of object clusters to dict.

      +

      Transforms list of object clusters to dict.

      Parameters

      object_clusters : list
      @@ -552,9 +560,11 @@

      Returns

      clusters : dict
      dict, where key is clusterlabel (int), value is cluster objects (list)
      -
      +
      -Source code + +Expand source code +
      def transform_cluster_objects_list_to_dict(object_clusters):
           """
           Transforms list of object clusters to dict.
      @@ -590,7 +600,7 @@ 

      Classes

      (model, dataset=None, precomputed_distances=None, object_clusters=None, max_top_number=10)
      -

      The class provide information about +

      The class provide information about top documents for the model topics from some collection.

      Parameters

      @@ -599,7 +609,7 @@

      Parameters

      a class of topic model
      dataset : Dataset
      a class that stores information about the collection
      -
      precomputed_distances :  np.array
      +
      precomputed_distances :  np.array
      array of shape (n_topics, n_objects) - an optional matrix of pairwise distances: distance from ith cluster centroid to the jth object
      @@ -609,9 +619,11 @@

      Parameters

      ith element of list is cluster of ith object
      max_top_number : int
      number of top documents to provide for each cluster
      -
      +
      -Source code + +Expand source code +
      class TopDocumentsViewer(BaseViewer):
           """ """
           def __init__(self,
      @@ -782,7 +794,7 @@ 

      Methods

      def view(self, current_num_top_doc=None, topic_names=None)
      -

      Returns list of tuples (token,score) for +

      Returns list of tuples (token,score) for each topic in the model.

      Parameters

      @@ -798,9 +810,11 @@

      Returns

      returns dict for each topic of the model dict contains document_ids of top documents for that topic and their probability of belonging to the topic
      -
      +
      -Source code + +Expand source code +
      def view(
           self,
           current_num_top_doc=None,
      @@ -875,10 +889,10 @@ 

      Returns

      -def view_from_jupyter(self, current_num_top_doc=None, topic_names=None, display_output=True, give_html=False) +def view_from_jupyter(self, current_num_top_doc: int = None, topic_names: list = None, display_output: bool = True, give_html: bool = False)
      -

      TopDocumentsViewer method recommended for use +

      TopDocumentsViewer method recommended for use from jupyter notebooks Returns texts of the actual documents.

      Parameters

      @@ -897,9 +911,11 @@

      Returns

      html_output
      html string of the output
      -
      +
      -Source code + +Expand source code +
      def view_from_jupyter(
               self,
               current_num_top_doc: int = None,
      @@ -991,7 +1007,7 @@ 

      diff --git a/docs/viewers/top_similar_documents_viewer.html b/docs/viewers/top_similar_documents_viewer.html index 6cbeec5..bab1db1 100644 --- a/docs/viewers/top_similar_documents_viewer.html +++ b/docs/viewers/top_similar_documents_viewer.html @@ -3,14 +3,14 @@ - + topicnet.viewers.top_similar_documents_viewer API documentation - - + + @@ -21,7 +21,9 @@

      Module topicnet.viewers.top_similar_documents_viewer
      -Source code + +Expand source code +
      import numpy as np
       import warnings
       
      @@ -591,17 +593,17 @@ 

      Module topicnet.viewers.top_similar_documents_viewerFunctions

      -def prepare_doc_html_with_similarity(document, distance, num_digits=3, num_sentences_in_snippet=4, num_words=15) +def prepare_doc_html_with_similarity(document, distance, num_digits: int = 3, num_sentences_in_snippet: int = 4, num_words: int = 15)
      -

      Prepares intital document and search results +

      Prepares intital document and search results html strings

      Parameters

      -
      document : Padas.DataFrame row
      +
      document : Padas.DataFrame row
      a row that contains columns raw_text and index in string form
      -
      distance : float between 0 and 1
      +
      distance : float between 0 and 1
      measure of how close found document to the initial inquiry
      num_digits
      @@ -617,9 +619,11 @@

      Returns

      doc_html : str
      an html string with data about document plus additional info for the output clarification
      -
      +
      -Source code + +Expand source code +
      def prepare_doc_html_with_similarity(
           document,
           distance,
      @@ -681,16 +685,18 @@ 

      Classes

      (model, dataset)
      -

      Viewer which uses topic model to find documents similar to given one

      +

      Viewer which uses topic model to find documents similar to given one

      Parameters

      model : BaseModel
      Topic model
      dataset : BaseDataset
      Dataset with information about documents
      -
      +
      -Source code + +Expand source code +
      class TopSimilarDocumentsViewer(BaseViewer):
           def __init__(self, model, dataset):
               """Viewer which uses topic model to find documents similar to given one
      @@ -1092,7 +1098,7 @@ 

      Methods

      def view(self, document_id, metric='jensenshannon', num_top_similar=5, keep_similar_by_words=True)
      -

      Shows documents similar to given one by distribution of topics

      +

      Shows documents similar to given one by distribution of topics

      Parameters

      document_id
      @@ -1110,11 +1116,13 @@

      Parameters

      Returns

      -
      tuple(list, list)
      +
      tuple(list, list)
      Top similar words, and corresponding distances to given document
      -
      +
      -Source code + +Expand source code +
      def view(self,
                document_id,
                metric='jensenshannon',
      @@ -1163,10 +1171,10 @@ 

      Returns

      -def view_from_jupyter(self, document_id, metric='jensenshannon', num_top_similar=5, num_digits=3, keep_similar_by_words=True, display_output=True, give_html=False) +def view_from_jupyter(self, document_id: str, metric: str = 'jensenshannon', num_top_similar: int = 5, num_digits: int = 3, keep_similar_by_words: bool = True, display_output: bool = True, give_html: bool = False)
      -

      Method for viewing documents similar to requested one +

      Method for viewing documents similar to requested one from jupyter notebook. Provides document titles and snippets of first few sentences.

      Parameters

      @@ -1192,9 +1200,11 @@

      Returns

      topic_html
      html string of the generated output
      -
      +
      -Source code + +Expand source code +
      def view_from_jupyter(
               self,
               document_id: str,
      @@ -1293,7 +1303,7 @@ 

      -

      Generated by pdoc 0.6.3.

      +

      Generated by pdoc 0.8.1.

      diff --git a/docs/viewers/top_tokens_viewer.html b/docs/viewers/top_tokens_viewer.html index e1cc2a1..13ec518 100644 --- a/docs/viewers/top_tokens_viewer.html +++ b/docs/viewers/top_tokens_viewer.html @@ -3,14 +3,14 @@ - + topicnet.viewers.top_tokens_viewer API documentation - - + + @@ -21,7 +21,9 @@

      Module topicnet.viewers.top_tokens_viewer

      -Source code + +Expand source code +
      import bisect
       import numpy as np
       import pandas as pd
      @@ -714,7 +716,7 @@ 

      Functions

      def compute_blei_scores(phi)
      -

      Computes Blei score
      +

      Computes Blei score
      phi[wt] * [log(phi[wt]) - 1/T sum_k log(phi[wk])]

      Parameters

      @@ -725,9 +727,11 @@

      Returns

      score : pd.DataFrame
      weighted phi matrix
      -
      +
      -Source code + +Expand source code +
      def compute_blei_scores(phi):
           """
           Computes Blei score  
      @@ -764,7 +768,7 @@ 

      Returns

      def compute_clusters_top_tokens_by_clusters_tfidf(objects_cluster, objects_content, max_top_number=10, n_topics=None)
      -

      Function for document-like clusters.
      +

      Function for document-like clusters.
      For each cluster compute top tokens of cluster. Top tokens are defined by tf-idf scheme. Tf-idf is computed as if clusters is concatenation of all it documents.

      Parameters

      @@ -782,11 +786,13 @@

      Parameters

      Returns

      -
      clusters_top_tokens : list of list of str:
      +
      clusters_top_tokens : list of list of str:
      ith element of list is list of top tokens of ith cluster
      -
      +
      -Source code + +Expand source code +
      def compute_clusters_top_tokens_by_clusters_tfidf(
               objects_cluster, objects_content,
               max_top_number=10, n_topics=None):
      @@ -861,7 +867,7 @@ 

      Returns

      def compute_joint_pwt_distribution(phi, p_t)
      -

      p(t) is prob(topic = t), defined as p(t) = sum_t n_t / n +

      p(t) is prob(topic = t), defined as p(t) = sum_t n_t / n

      if we fix some word w, we can calculate weighted_pk:
      wp_t = p(t) p(w|t)

      @@ -877,9 +883,11 @@

      Returns

      joint_pwt : np.array of float
      array of probabilities that a fixed token from the collection belongs to that topic
      -
      +
      -Source code + +Expand source code +
      def compute_joint_pwt_distribution(phi, p_t):
           """
           p(t) is prob(topic = t), defined as p(t) = sum_t n_t / n  
      @@ -910,7 +918,7 @@ 

      Returns

      def compute_likelihood_vectorised(phi, p_t, joint_pwt)
      -

      Likelihood ratio is defined as
      +

      Likelihood ratio is defined as
      L = phi_wt / sum_k p(k)/p(!t) phi_wk
      equivalently:
      L = phi_wt * p(!t) / sum_k!=t p(k) phi_wk
      @@ -932,9 +940,11 @@

      Returns

      target_values : np.array of float
      vector of likelihood ratios that tokens belong to the given topic
      -
      +
      -Source code + +Expand source code +
      def compute_likelihood_vectorised(phi, p_t, joint_pwt):
           """
           Likelihood ratio is defined as  
      @@ -984,7 +994,7 @@ 

      Returns

      def compute_pt_distribution(model, class_ids=None)
      -

      Calculates the Prob(t) vector (vector contains an entry for each topic).

      +

      Calculates the Prob(t) vector (vector contains an entry for each topic).

      Parameters

      model : TopicModel
      @@ -995,11 +1005,13 @@

      Parameters

      Returns

      -
      float probability that a random token from the collection belongs to that topic
      +
      float probability that a random token from the collection belongs to that topic
       
      -
      +
      -Source code + +Expand source code +
      def compute_pt_distribution(model, class_ids=None):
           """
           Calculates the Prob(t) vector (vector contains an entry for each topic).
      @@ -1028,9 +1040,11 @@ 

      Returns

      def compute_ptw(joint_pwt)
      -
      +
      -Source code + +Expand source code +
      def compute_ptw(joint_pwt):
           return joint_pwt / np.sum(joint_pwt, axis=0)  # sum by all T
      @@ -1039,9 +1053,11 @@

      Returns

      def convert_df_to_html(df)
      -
      +
      -Source code + +Expand source code +
      def convert_df_to_html(df):
           return df.style\
                      .set_table_attributes("style='display:inline'")\
      @@ -1052,7 +1068,7 @@ 

      Returns

      def get_top_values(values, top_number)
      -

      Returns top_number top values from the matrix for each column.

      +

      Returns top_number top values from the matrix for each column.

      Parameters

      values : np.array
      @@ -1066,9 +1082,11 @@

      Returns

      array of top_number top values for each column of the initial array
      top_indexes : nd.array
      array of original indexes for top_values array (Default value = True)
      -
      +
      -Source code + +Expand source code +
      def get_top_values(values, top_number):
           """
           Returns top_number top values from the matrix for each column.
      @@ -1111,7 +1129,7 @@ 

      Returns

      def get_top_values_by_sum(values, min_sum_value)
      -

      Returns top values until sum of their scores breaches min_sum_value.

      +

      Returns top values until sum of their scores breaches min_sum_value.

      Parameters

      values : np.array
      @@ -1127,16 +1145,15 @@

      Returns

      array of original indexes for top_values array (Default value = True)

      Examples

      -
      >>> values = np.array([1, 3, 2, 0.1, 5, 0])
      +
      >>> values = np.array([1, 3, 2, 0.1, 5, 0])
       >>> min_sum = 8.1
       >>> top_values, top_indexes = get_top_values_by_sum(values, min_sum)
      -**`Result`** :&ensp;`top_values`, `top_indexes` = (`array`([`5.`, `3.`, `2.`]), `array`([`4`, `1`, `2`]))
      -
      -
      -
       
      -
      +Result: top_values, top_indexes = (array([5., 3., 2.]), array([4, 1, 2])) +
      -Source code + +Expand source code +
      def get_top_values_by_sum(values, min_sum_value,):
           """
           Returns top values until sum of their scores breaches `min_sum_value`.
      @@ -1186,10 +1203,10 @@ 

      Classes

      class TopTokensViewer -(model, class_ids=None, method='blei', num_top_tokens=10, alpha=1, by_sum=False, sum_value=None, dataset=None) +(model, class_ids: List[str] = None, method: str = 'blei', num_top_tokens: int = 10, alpha: float = 1, by_sum: bool = False, sum_value: float = None, dataset=None)
      -

      Gets top tokens from topic (sorted by scores)

      +

      Gets top tokens from topic (sorted by scores)

      The class provide information about top tokens of the model topics providing with different methods to score that.

      Parameters

      @@ -1207,7 +1224,7 @@

      Parameters

      ptw - something like likelihood
      num_top_tokens : int
      number of top tokens to provide for each topic
      -
      alpha : float between 0 and 1
      +
      alpha : float between 0 and 1
      additional constant needed for ptw method of scoring
      by_sum
      @@ -1218,9 +1235,11 @@

      Parameters

      a good default value might be different depending on self.method value
      dataset : Dataset
      a class that stores information about the collection
      -
      +
      -Source code + +Expand source code +
      class TopTokensViewer(BaseViewer):
           """Gets top tokens from topic (sorted by scores)"""
           def __init__(self,
      @@ -1611,9 +1630,11 @@ 

      Instance variables

      var cached_top_tokens
      -
      +
      -Source code + +Expand source code +
      @property
       def cached_top_tokens(self):
           if self._cached_top_tokens is None:
      @@ -1625,12 +1646,14 @@ 

      Instance variables

      Methods

      -def to_df(self, topic_names=None, digits=5) +def to_df(self, topic_names: Iterator[str] = None, digits: int = 5) -> pandas.core.frame.DataFrame
      -
      +
      -Source code + +Expand source code +
      def to_df(self, topic_names: Iterator[str] = None, digits: int = 5) -> pd.DataFrame:
           topic_top_tokens = self.cached_top_tokens
       
      @@ -1651,24 +1674,24 @@ 

      Methods

      -def to_html(self, topic_names=None, digits=5, thresh=None, horizontally_stack=True) +def to_html(self, topic_names: Union[str, List[str]] = None, digits: int = 5, thresh: float = None, horizontally_stack: bool = True) -> str
      -

      Generates html version of dataframes to be displayed by Jupyter notebooks

      +

      Generates html version of dataframes to be displayed by Jupyter notebooks

      Parameters

      topic_names : list of strings
      Initial dictionary keys
      digits : int
      Number of digits to round each probability to
      -
      thresh : float [Deprecated]
      +
      thresh : float [Deprecated]
      Threshold used for calculating digits and throwing out too low probabilities
      horizontally_stack : bool
      if True, then tokens for each modality will be stacked horizontally (instead of being a single long multi-line DataFrame)

      Examples

      -
      >>> from IPython.display import HTML, display_html
      +
      >>> from IPython.display import HTML, display_html
       >>>
       >>> # model training here
       >>> # ...
      @@ -1676,9 +1699,11 @@ 

      Examples

      >>> display_html(viewer.to_html(), raw=True) >>> # or >>> HTML(viewer.to_html()) -
      +
      -Source code + +Expand source code +
      def to_html(
               self,
               topic_names: Union[str, List[str]] = None,
      @@ -1749,10 +1774,10 @@ 

      Examples

      -def view(self, class_ids=None, raw_data=None, three_levels=True) +def view(self, class_ids: List[str] = None, raw_data: List[List[str]] = None, three_levels: bool = True) -> Union[Dict[str, Dict[str, Dict[str, float]]], Dict[str, Dict[Tuple[str, str], float]]]
      -

      Returns list of tuples (token, score) for each topic in the model.

      +

      Returns list of tuples (token, score) for each topic in the model.

      Parameters

      class_ids
      @@ -1762,14 +1787,16 @@

      Parameters

      three_levels
      If true, three level dict will be returned, otherwise — two level one
      -

      returns

      +

      Returns

      -
      topic_top_tokens : nested 3 or 2-level dict
      +
      topic_top_tokens : nested 3 or 2-level dict
      Topic -> Modality -> Token -> Probability or Topic -> (Modality, Token) -> Probability
      -
      +
      -Source code + +Expand source code +
      def view(
               self,
               class_ids: List[str] = None,
      @@ -1858,10 +1885,10 @@ 

      returns

      -def view_from_jupyter(self, topic_names=None, digits=5, horizontally_stack=True, one_topic_per_row=True, display_output=True, give_html=False) +def view_from_jupyter(self, topic_names: Union[str, List[str]] = None, digits: int = 5, horizontally_stack: bool = True, one_topic_per_row: bool = True, display_output: bool = True, give_html: bool = False)
      -

      TopTokensViewer method recommended for use +

      TopTokensViewer method recommended for use from jupyter notebooks

      Parameters

      @@ -1884,19 +1911,21 @@

      Parameters

      Returns

      -
      topic_html_strings : list of strings in HTML format
      +
      topic_html_strings : list of strings in HTML format
       

      Examples

      -
      >>> # model training here
      +
      >>> # model training here
       >>> # ...
       >>> viewer = TopTokensViewer(model)
       >>> information = viewer.view_from_jupyter()
       >>> # or
       >>> information = viewer.view_from_jupyter(display_output=False)
      -
      +
      -Source code + +Expand source code +
      def view_from_jupyter(
               self,
               topic_names: Union[str, List[str]] = None,
      @@ -2024,7 +2053,7 @@ 

      -

      Generated by pdoc 0.6.3.

      +

      Generated by pdoc 0.8.1.

      diff --git a/docs/viewers/topic_flow_viewer.html b/docs/viewers/topic_flow_viewer.html index 805021a..17f491b 100644 --- a/docs/viewers/topic_flow_viewer.html +++ b/docs/viewers/topic_flow_viewer.html @@ -3,14 +3,14 @@ - + topicnet.viewers.topic_flow_viewer API documentation - - + + @@ -21,7 +21,9 @@

      Module topicnet.viewers.topic_flow_viewer

      -Source code + +Expand source code +
      import numpy as np
       import plotly.graph_objects as go
       import artm
      @@ -179,7 +181,7 @@ 

      Classes

      (model, time_labels, dataset, modality='@lemmatized', sort_key_function=None)
      -

      Viewer to show trending topics over time.

      +

      Viewer to show trending topics over time.

      Parameters

      model : TopicModel
      @@ -192,9 +194,11 @@

      Parameters

      model's modality for topics description
      sort_key_function : Function
      function that can be used with python sorted
      -
      +
      -Source code + +Expand source code +
      class TopicFlowViewer(BaseViewer):
           """
           Viewer to show trending topics over time.
      @@ -340,14 +344,16 @@ 

      Methods

      def compute_nd(self, number_of_docs)
      -

      Compute number of tokens in each document from dataset.

      +

      Compute number of tokens in each document from dataset.

      Parameters

      number_of_docs : int
      number of documents in theta
      -
      +
      -Source code + +Expand source code +
      def compute_nd(self, number_of_docs):
           """
           Compute number of tokens in each document from dataset.
      @@ -380,13 +386,15 @@ 

      Parameters

      def compute_top_tokens(self, model, modality)
      -

      Function for top tokens extraction.

      +

      Function for top tokens extraction.

      Parameters:

      model : TopicModel modality : str -modality for topic representation

      +modality for topic representation

      -Source code + +Expand source code +
      def compute_top_tokens(self, model, modality):
           """
           Function for top tokens extraction.
      @@ -409,16 +417,18 @@ 

      Parameters:

      def plot(self, topics, significance_threshold=0.01)
      -

      Function for plotly graph building.

      +

      Function for plotly graph building.

      Parameters

      topics : list of int
      topics that need to be visualized
      significance_threshold : float
      plot ignores values lower than threshold
      -
      +
      -Source code + +Expand source code +
      def plot(self, topics, significance_threshold=1e-2):
           """
           Function for plotly graph building.
      @@ -464,13 +474,15 @@ 

      Parameters

      def view(self, topic_names=None)
      -

      Parameters

      +

      Parameters

      topic_names : list of str
      topics that user wants to see on plot
      -
      +
      -Source code + +Expand source code +
      def view(self, topic_names=None):
           """
           Parameters
      @@ -515,7 +527,7 @@ 

      -

      Generated by pdoc 0.6.3.

      +

      Generated by pdoc 0.8.1.

      diff --git a/docs/viewers/topic_mapping.html b/docs/viewers/topic_mapping.html index cb30d3c..4d14735 100644 --- a/docs/viewers/topic_mapping.html +++ b/docs/viewers/topic_mapping.html @@ -3,14 +3,14 @@ - + topicnet.viewers.topic_mapping API documentation - - + + @@ -21,7 +21,9 @@

      Module topicnet.viewers.topic_mapping

      -Source code + +Expand source code +
      import numpy as np
       from scipy import optimize
       from scipy.spatial import distance
      @@ -277,7 +279,7 @@ 

      Functions

      def compute_topic_mapping(matrix_left, matrix_right, metric='euclidean')
      -

      This function provides mapping of topics +

      This function provides mapping of topics from one model to the topics of the other model based on their similarity defined by the metric.

      Parameters

      @@ -297,9 +299,11 @@

      Returns

      tuple of ndarrays
      returns two ndarrays of indices, where each index corresponds to a topic from respective models
      -
      +
      -Source code + +Expand source code +
      def compute_topic_mapping(matrix_left, matrix_right, metric='euclidean'):
           """
           This function provides mapping of topics
      @@ -344,7 +348,7 @@ 

      Classes

      (model, second_model, mode='min', metric='euclidean', class_ids=None)
      -

      Performs a mapping between topics of two model +

      Performs a mapping between topics of two models, matching the two closest topics together based on the Hungarian algorithm.

      Parameters

      @@ -364,9 +368,11 @@

      Parameters

      name of scipy metrics used in distance computation or function that computes pairwise distance between 2 matrices (Default value = "euclidean")
      -
      +
      -Source code + +Expand source code +
      class TopicMapViewer(BaseViewer):
           def __init__(
               self,
      @@ -578,19 +584,21 @@ 

      Methods

      def view(self, class_ids=None)
      -

      Returns pairs of close topics.

      +

      Returns pairs of close topics.

      Parameters

      -
      class_ids : list of str, default - None
      +
      class_ids : list of str, default - None
      parameter for model.get_phi method

      Returns

      -
      tuple of nd.arrays of strings:
      +
      tuple of nd.arrays of strings:
      two ordered arrays of topic name pairs
      -
      +
      -Source code + +Expand source code +
      def view(self, class_ids=None):
           """
           Returns pairs of close topics.
      @@ -682,10 +690,10 @@ 

      Returns

      -def view_from_jupyter(self, display_output=True, give_html=False, **kwargs) +def view_from_jupyter(self, display_output: bool = True, give_html: bool = False, **kwargs)
      -

      TopicMapViewer method recommended for use +

      TopicMapViewer method recommended for use from Jupyter notebooks; returns the closest pairs of the models' topics and visualizes their top tokens

      @@ -704,12 +712,12 @@

      Returns

      html string of the output

      Another Parameters

      -
      -
      **kwargs
      -
      kwargs are optional ~.TopTokenViewer properties
      -
      +

      kwargs +kwargs are optional ~.TopTokenViewer properties

      -Source code + +Expand source code +
      def view_from_jupyter(
               self,
               display_output: bool = True,
      @@ -817,7 +825,7 @@ 

      -

      Generated by pdoc 0.6.3.

      +

      Generated by pdoc 0.8.1.

      diff --git a/topicnet/README-rus.md b/topicnet/README-rus.md deleted file mode 100644 index 19f6b4f..0000000 --- a/topicnet/README-rus.md +++ /dev/null @@ -1,23 +0,0 @@ -## TopicNet -[English version](README.md) - -Библиотека ```topicnet``` создана для автоматизации и унификации практик тематического моделирования. Она помогает снять с пользователя заботу о рутинной составляющей моделирования, освободив время для творческого процесса подбора функционала отвечающего критериям задачи. -На данный момент библиотека представлена тремя модулями: - - -* ```cooking_machine``` - отвечает за логирование эксперимента по построению тематических моделей и связь между сущностями образующими эксперимент. - - -* ```viewers``` - помогает визуализовать информацию о полученной тематической модели в доступном для экспериментатора формате. - - -* ```tests``` - помогает проверить правильность установки библиотеки TopicNet и содержит примеры работы с кодом библиотеки. - ---- -### Описание -Важно отметить, что на каждом шаге эксперимента используется один и тот же кубик для дальнейшей настройки моделей. -Если для разным тематических моделей после очередного этапа обучения применяются разные кубики, то эксперимент разделяется на два отдельных эксперимента, которые отличаются данным и последующими этапами обучения, иными словами последовательностями применения кубиков с соответствующими им параметрам. -Объект эксперимента содержит всю необходимую информацию для воспроизведения данного эксперимента с нуля и изменяется в процессе добавления кубиков пользователя. -При использовании кубиков также изменяется информация о моделях и их описание, создаются новые модели копии основной, которые соответствуют перебираемым внутри кубика параметрам. -Описание моделей также позволяет построить их по описанию и создать скрипт их создания и обучения. -Таким образом, ключевым объектом - является кубик, а точнее их последовательность, которые взаимодействуют с объектами модель и эксперимент. 
\ No newline at end of file diff --git a/topicnet/bitbucket-pipelines.yml b/topicnet/bitbucket-pipelines.yml index 76adcc8..28535ed 100644 --- a/topicnet/bitbucket-pipelines.yml +++ b/topicnet/bitbucket-pipelines.yml @@ -4,7 +4,7 @@ pipelines: '**': - step: name: Lint by Flake8 - image: python:3.6.0 + image: python:3.7.0 caches: - pip script: diff --git a/topicnet/cooking_machine/config_parser.py b/topicnet/cooking_machine/config_parser.py index ecbf063..7c4fbb3 100644 --- a/topicnet/cooking_machine/config_parser.py +++ b/topicnet/cooking_machine/config_parser.py @@ -26,7 +26,7 @@ checked and which aren't quite here yet. Our process consists of three stages: -1) we check the high-level structure using `base_schema`. +1) we check the high-level structure using `BASE_SCHEMA`. The presence of each required key is ensured. After this stage we could be sure than we can create a valid model using specified parameters. @@ -45,61 +45,46 @@ def __init__(self, num_iters: int = 5) but it's a work-in-progress currently. 
""" # noqa: W291 -from .cubes import RegularizersModifierCube, CubeCreator + +from inspect import signature, Parameter +from typing import ( + Callable, + Type, +) + +from .cubes import ( + CubeCreator, + RegularizersModifierCube, + GreedyStrategy, + PerplexityStrategy, +) from .experiment import Experiment from .dataset import Dataset from .models import scores as tnscores from .models import TopicModel - -from .cubes import PerplexityStrategy, GreedyStrategy -from .model_constructor import init_simple_default_model, create_default_topics -from .rel_toolbox_lite import count_vocab_size, handle_regularizer +from .model_constructor import ( + create_default_topics, + init_simple_default_model, +) +from .rel_toolbox_lite import ( + count_vocab_size, + handle_regularizer, +) import artm -from inspect import signature, Parameter from strictyaml import Map, Str, Int, Seq, Float, Bool from strictyaml import Any, Optional, EmptyDict, EmptyNone, EmptyList from strictyaml import dirty_load -from typing import Type - -# TODO: use stackoverflow.com/questions/37929851/parse-numpydoc-docstring-and-access-components -# for now just hardcode most common / important types -ARTM_TYPES = { - "tau": Float(), - "topic_names": Str() | Seq(Str()) | EmptyNone(), - # TODO: handle class_ids in model and in regularizers separately - "class_ids": Str() | Seq(Str()) | EmptyNone(), - "gamma": Float() | EmptyNone(), - "seed": Int(), - "num_document_passes": Int(), - "num_processors": Int(), - "cache_theta": Bool(), - "reuse_theta": Bool(), - "theta_name": Str() -} - -element = Any() -base_schema = Map({ - 'regularizers': Seq(element), - Optional('scores'): Seq(element), - 'stages': Seq(element), - 'model': Map({ - "dataset_path": Str(), - Optional("modalities_to_use"): Seq(Str()), - Optional("modalities_weights"): Any(), - "main_modality": Str(), - }), - 'topics': Map({ - "background_topics": Seq(Str()) | Int() | EmptyList(), - "specific_topics": Seq(Str()) | Int() | EmptyList(), - }) -}) 
SUPPORTED_CUBES = [CubeCreator, RegularizersModifierCube] SUPPORTED_STRATEGIES = [PerplexityStrategy, GreedyStrategy] +TYPE_VALIDATORS = { + 'int': Int(), 'bool': Bool(), 'str': Str(), 'float': Float() +} + def choose_key(param): """ @@ -113,6 +98,7 @@ def choose_key(param): """ if param.default is not Parameter.empty: return Optional(param.name) + return param.name @@ -136,9 +122,79 @@ def choose_validator(param): return Str() if param.name in ARTM_TYPES: return ARTM_TYPES[param.name] + return Any() +# TODO: maybe this is cool, but do we really need this? +def build_schema_from_function(func: Callable) -> dict: + from docstring_parser import parse as docstring_parse + + func_params = signature(func).parameters + func_params_schema = dict() + + for elem in docstring_parse(func.__doc__).params: + if elem.arg_name in func_params: + key = choose_key(func_params[elem.arg_name]) + func_params_schema[key] = TYPE_VALIDATORS[elem.type_name] + + return func_params_schema + + +# TODO: use stackoverflow.com/questions/37929851/parse-numpydoc-docstring-and-access-components +# for now just hardcode most common / important types +ARTM_TYPES = { + "tau": Float(), + "topic_names": Str() | Seq(Str()) | EmptyNone(), + # TODO: handle class_ids in model and in regularizers separately + "class_ids": Str() | Seq(Str()) | EmptyNone(), + "gamma": Float() | EmptyNone(), + "seed": Int(), + "num_document_passes": Int(), + "num_processors": Int(), + "cache_theta": Bool(), + "reuse_theta": Bool(), + "theta_name": Str() +} + + +_ELEMENT = Any() + +# TODO: maybe better _DICTIONARY_FILTER_SCHEMA = build_schema_from_function(artm.Dictionary.filter) +# TODO: modalities, filter params - these all are dataset's options, not model's +# maybe make separate YML block for dataset? 
+ +BASE_SCHEMA = Map({ + 'regularizers': Seq(_ELEMENT), + Optional('scores'): Seq(_ELEMENT), + 'stages': Seq(_ELEMENT), + 'model': Map({ + "dataset_path": Str(), + Optional("dictionary_filter_parameters"): Map({ + Optional("class_id"): Str(), + Optional("min_df"): Float(), + Optional("max_df"): Float(), + Optional("min_df_rate"): Float(), + Optional("max_df_rate"): Float(), + Optional("min_tf"): Float(), + Optional("max_tf"): Float(), + Optional("max_dictionary_size"): Float(), + Optional("recalculate_value"): Bool(), + }), + Optional("keep_in_memory"): Bool(), + Optional("internals_folder_path"): Bool(), + Optional("modalities_to_use"): Seq(Str()), + Optional("modalities_weights"): Any(), + "main_modality": Str(), + }), + 'topics': Map({ + "background_topics": Seq(Str()) | Int() | EmptyList(), + "specific_topics": Seq(Str()) | Int() | EmptyList(), + }) +}) +KEY_DICTIONARY_FILTER_PARAMETERS = 'dictionary_filter_parameters' + + def build_schema_from_signature(class_of_object, use_optional=True): """ Parameters @@ -439,7 +495,7 @@ def parse_modalities_data(parsed): # exactly one should be specified if has_modalities_to_use == has_weights: - raise ValueError(f"Either 'modalities_to_use' or 'modalities_weights' should be specified") + raise ValueError("Either 'modalities_to_use' or 'modalities_weights' should be specified") if has_weights: modalities_to_use = list(parsed["model"]["modalities_weights"].data) @@ -474,8 +530,9 @@ def parse( regularizers: list topic_model: TopicModel dataset: Dataset + """ - parsed = dirty_load(yaml_string, base_schema, allow_flow_style=True) + parsed = dirty_load(yaml_string, BASE_SCHEMA, allow_flow_style=True) specific_topic_names, background_topic_names = create_default_topics( parsed.data["topics"]["specific_topics"], @@ -484,12 +541,22 @@ def parse( revalidate_section(parsed, "stages") revalidate_section(parsed, "regularizers") + if "scores" in parsed: revalidate_section(parsed, "scores") - cube_settings = [] + dataset = 
dataset_class( + data_path=parsed.data["model"]["dataset_path"], + keep_in_memory=parsed.data["model"].get("keep_in_memory", True), + internals_folder_path=parsed.data["model"].get("internals_folder_path", None), + ) + filter_parameters = parsed.data["model"].get( + KEY_DICTIONARY_FILTER_PARAMETERS, dict() + ) - dataset = dataset_class(parsed.data["model"]["dataset_path"]) + if len(filter_parameters) > 0: + filtered_dictionary = dataset.get_dictionary().filter(**filter_parameters) + dataset._cached_dict = filtered_dictionary modalities_to_use = parse_modalities_data(parsed) @@ -508,11 +575,12 @@ def parse( topic_model = TopicModel(model) _add_parsed_scores(parsed, topic_model) + cube_settings = list() + for stage in parsed['stages']: for elemtype, elem_args in stage.items(): settings = build_cube_settings(elemtype.data, elem_args) - if force_separate_thread: - settings[elemtype]["separate_thread"] = False + settings[elemtype]["separate_thread"] = force_separate_thread cube_settings.append(settings) return cube_settings, regularizers, topic_model, dataset @@ -548,8 +616,12 @@ def revalidate_section(parsed, section): stage.revalidate(local_schema) -def build_experiment_environment_from_yaml_config(yaml_string, experiment_id, - save_path, force_separate_thread=False): +def build_experiment_environment_from_yaml_config( + yaml_string, + experiment_id, + save_path, + force_separate_thread=False, +): """ Wraps up parameter extraction and class instances creation from yaml formatted string @@ -572,11 +644,12 @@ def build_experiment_environment_from_yaml_config(yaml_string, experiment_id, Returns ------- - tuple experiment, dataset instances of corresponding classes from topicnet + """ settings, regs, model, dataset = parse(yaml_string, force_separate_thread) # TODO: handle dynamic addition of regularizers experiment = Experiment(experiment_id=experiment_id, save_path=save_path, topic_model=model) experiment.build(settings) + return experiment, dataset diff --git 
a/topicnet/cooking_machine/cubes/controller_cube.py b/topicnet/cooking_machine/cubes/controller_cube.py index 65f8d95..1e01655 100644 --- a/topicnet/cooking_machine/cubes/controller_cube.py +++ b/topicnet/cooking_machine/cubes/controller_cube.py @@ -18,12 +18,15 @@ We assume that if that metric is 'sort of decreasing', then everything is OK and we are allowed to change tau coefficient further; otherwise we revert back to the last "safe" value and stop - - 'sort of decreasing' performs best with `PerplexityScore`, and all scores which - behave like perplexity (nonnegative, and which should decrease when a model gets better). - If you want to track a different kind of score, it is recommended to use `score_controller` parameter - More formal definition of "sort of decreasing": if we divide a curve into two parts like so: + 'sort of decreasing' performs best with `PerplexityScore`, + and all scores which behave like perplexity + (nonnegative, and which should decrease when a model gets better). + If you want to track a different kind of score, + it is recommended to use `score_controller` parameter + + More formal definition of "sort of decreasing": + if we divide a curve into two parts like so: ##################################### @@ -52,13 +55,15 @@ then the right part is no higher than 5% of global minimum (you can change 5% if you like by adjusting `fraction_threshold` parameter) - If `score_to_track` is None and `score_controller` is None, then `ControllerAgent` will never stop + If `score_to_track` is None and `score_controller` is None, + then `ControllerAgent` will never stop (useful for e.g. 
decaying coefficients) fraction_threshold: float Threshold to control a score by 'sort of decreasing' metric score_controller: BaseScoreController Custom score controller - In case of 'sort of decreasing' is not proper to control score, you are able to create custom Score Controller + In case of 'sort of decreasing' is not proper to control score, + you are able to create custom Score Controller inherited from `BaseScoreController`. tau_converter: str or callable Notably, def-style functions and lambda functions are allowed @@ -120,13 +125,20 @@ import warnings from copy import deepcopy from dataclasses import dataclass -from typing import List, Optional +from numbers import Number +from typing import ( + Callable, + List, + Optional, + Union, +) import numexpr as ne import numpy as np from dill.source import getsource from .base_cube import BaseCube +from ..models.base_regularizer import BaseRegularizer from ..rel_toolbox_lite import count_vocab_size, handle_regularizer W_HALT_CONTROL = "Process of dynamically changing tau was stopped at {} iteration" @@ -148,6 +160,7 @@ def get_score_values(self, model): return None vals = model.scores[self.score_name] + if len(vals) == 0: return None @@ -162,8 +175,10 @@ def __call__(self, model): try: out_of_control_result = self.is_out_of_control(values) except Exception as ex: - message = (f"An error occured while controlling {self.score_name}. Message: {ex}. Score values: {values}") - raise ValueError(message) + raise ValueError( + f"An error occurred while controlling {self.score_name}!" + f" Message: {ex}. Score values: {values}" + ) if out_of_control_result.error_message is not None: warnings.warn(out_of_control_result.error_message) @@ -176,7 +191,8 @@ def is_out_of_control(self, values: List[float]) -> OutOfControlAnswer: class PerplexityScoreController(BaseScoreController): """ - Controller is proper to control the Perplexity score. For others, please ensure for yourself. 
+ Controller is proper to control the Perplexity score. + For others, please ensure for yourself. """ DEFAULT_FRACTION_THRESHOLD = 0.05 @@ -194,23 +210,26 @@ def is_out_of_control(self, values: List[float]): minval = values[idxmin] if minval <= 0: - err_message = f"""Score {self.score_name} has min_value = {minval} which is <= 0. - This control scheme is using to control scores acting like Perplexity. - Ensure you control the Perplexity score or write your own controller""" - raise ValueError(err_message) + raise ValueError( + f'Score "{self.score_name}" has min_value = {minval} which is <= 0.' + f' This control scheme is using to control scores acting like Perplexity.' + f' Ensure you control the Perplexity score or write your own controller!' + ) answer = (right_maxval - minval) / minval > self.fraction_threshold if answer: - message = (f"Score {self.score_name} is too high! Right max value: {right_maxval}, min value: {minval}") - return OutOfControlAnswer(answer=answer, error_message=message) + return OutOfControlAnswer( + answer=answer, + error_message=( + f"Score {self.score_name} is too high!" + f" Right max value: {right_maxval}, min value: {minval}" + ), + ) return OutOfControlAnswer(answer=answer) -class ControllerAgentException(Exception): pass - - class ControllerAgent: """ Allows to change `tau` during the `_fit` method. @@ -234,27 +253,39 @@ class ControllerAgent: See top-level docstring for details. 
""" - def __init__(self, reg_name, tau_converter, max_iters, score_to_track=None, fraction_threshold=None, - score_controller=None, local_dict=None): + def __init__( + self, + reg_name: str, + tau_converter: Callable or str, + max_iters: int or float, + score_to_track: Union[str, List[str], None] = None, + fraction_threshold: Union[float, List[float], None] = None, + score_controller: Union[BaseScoreController, List[BaseScoreController], None] = None, + local_dict: dict = None): """ Parameters ---------- - reg_name : str - tau_converter : callable or str - max_iters : int or float - Agent will stop changing tau after `max_iters` iterations + reg_name + tau_converter + max_iters + Agent will stop changing tau after `max_iters` iterations, `max_iters` could be `float("NaN")` and `float("inf")` values: that way agent will continue operating even outside this `RegularizationControllerCube` - score_to_track : str, list of str or None + score_to_track Name of score to track. Please, use this definition to track only scores of type PerplexityScore. In other cases we recommend you to write you own ScoreController - fraction_threshold : float, list of float of the same length as score_to_track or None + fraction_threshold Uses to define threshold to control PerplexityScore - Default value is 0.05 - score_controller : BaseScoreController, list of BaseScoreController or None - local_dict : dict + Default value is 0.05. + If `fraction_threshold` is a list, it should be of the same length, as `score_to_track`. + score_controller + Score controller or controllers. + One can use this parameter for scores other than Perplexity + (or other scores that behave like Perplexity). + This is a more flexible and customizable way to control scores. 
+ local_dict """ if local_dict is None: local_dict = dict() @@ -262,41 +293,84 @@ def __init__(self, reg_name, tau_converter, max_iters, score_to_track=None, frac self.reg_name = reg_name self.tau_converter = tau_converter - self.score_controllers = [] - if isinstance(score_to_track, list): - if fraction_threshold is None: - controller_params = [(name, PerplexityScoreController.DEFAULT_FRACTION_THRESHOLD) for name in - score_to_track] - elif isinstance(fraction_threshold, list) and len(score_to_track) == len(fraction_threshold): - controller_params = list(zip(score_to_track, fraction_threshold)) - else: - err_message = """Length of score_to_track and fraction_threshold must be same. - Otherwise fraction_threshold must be None""" - raise ControllerAgentException(err_message) - - self.score_controllers.append( - [PerplexityScoreController(name, threshold) for (name, threshold) in controller_params]) + scores_to_track = self._validate_score_to_track(score_to_track) + fraction_thresholds = self._validate_fraction_threshold( + fraction_threshold, required_length=len(scores_to_track) + ) - elif isinstance(score_to_track, str): - self.score_controllers.append([PerplexityScoreController( - score_to_track, - fraction_threshold or PerplexityScoreController.DEFAULT_FRACTION_THRESHOLD - )]) + assert len(scores_to_track) == len(fraction_thresholds) - if isinstance(score_controller, BaseScoreController): - self.score_controllers.append(score_controller) - elif isinstance(score_controller, list): - if not all(isinstance(score, BaseScoreController) for score in score_controller): - err_message = """score_controller must be of type BaseScoreController or list of BaseScoreController""" - raise ControllerAgentException(err_message) + perplexity_like_score_controllers = [ + PerplexityScoreController(name, threshold) + for (name, threshold) in zip(scores_to_track, fraction_thresholds) + ] - self.score_controllers.extend(score_controller) + self.score_controllers = list() + 
self.score_controllers.extend(perplexity_like_score_controllers) + self.score_controllers.extend( + self._validate_score_controller(score_controller) + ) self.is_working = True self.local_dict = local_dict self.tau_history = [] self.max_iters = max_iters + @staticmethod + def _validate_score_to_track( + score_to_track: Union[str, List[str], None]) -> List[str]: + + if isinstance(score_to_track, list): + return score_to_track + if score_to_track is None: + return list() + if isinstance(score_to_track, str): + return [score_to_track] + + raise TypeError(f'Wrong type of `score_to_track`: "{type(score_to_track)}"!') + + @staticmethod + def _validate_fraction_threshold( + fraction_threshold: Union[float, List[float], None], + required_length: int, + ) -> List[float]: + + if fraction_threshold is None: + return [PerplexityScoreController.DEFAULT_FRACTION_THRESHOLD] * required_length + if isinstance(fraction_threshold, Number): + return [float(fraction_threshold)] * required_length + + if not isinstance(fraction_threshold, list): + raise TypeError( + f'Wrong type of `fraction_threshold`: "{type(fraction_threshold)}"!' + ) + + if len(fraction_threshold) != required_length: + raise ValueError( + f'Wrong length of `fraction_threshold`: {len(fraction_threshold)}!' + f' Expected the length to be equal to {required_length}.' 
+ ) + + return fraction_threshold + + @staticmethod + def _validate_score_controller( + score_controller: Union[BaseScoreController, List[BaseScoreController], None] + ) -> List[BaseScoreController]: + + if score_controller is None: + return list() + + elif isinstance(score_controller, BaseScoreController): + return [score_controller] + + elif (not isinstance(score_controller, list) or not all( + isinstance(score, BaseScoreController) for score in score_controller)): + raise TypeError(f'Wrong type of `score_controller`: "{type(score_controller)}"!') + + else: + return score_controller + def _convert_tau(self): """ """ if isinstance(self.tau_converter, str): @@ -327,7 +401,7 @@ def invoke(self, model, cur_iter): Note that zero means "cube just started", not "the model is brand new" """ - current_tau = model.regularizers[self.reg_name].tau + current_tau = model.get_regularizer(self.reg_name).tau self.tau_history.append(current_tau) self.local_dict["prev_tau"] = current_tau self.local_dict["cur_iter"] = cur_iter @@ -346,9 +420,9 @@ def invoke(self, model, cur_iter): if should_stop: warnings.warn(W_HALT_CONTROL.format(len(self.tau_history))) self.is_working = False - model.regularizers[self.reg_name].tau = self._find_safe_tau() + model.get_regularizer(self.reg_name).tau = self._find_safe_tau() else: - model.regularizers[self.reg_name].tau = self._convert_tau() + model.get_regularizer(self.reg_name).tau = self._convert_tau() class RegularizationControllerCube(BaseCube): @@ -390,7 +464,9 @@ def __init__(self, num_iter: int, parameters, >> ) >> "score_to_track": None, >> "fraction_threshold": None, - >> "score_controller": [PerplexityScoreController("PerplexityScore@all", 0.1)], + >> "score_controller": [ + >> PerplexityScoreController("PerplexityScore@all", 0.1) + >> ], >> "user_value_grid": [0, 1]} reg_search : str @@ -417,9 +493,9 @@ def __init__(self, num_iter: int, parameters, separate_thread=separate_thread) self._relative = use_relative_coefficients self.data_stats 
= None - self.raw_parameters = parameters if isinstance(parameters, dict): parameters = [parameters] + self.raw_parameters = parameters self._convert_parameters(parameters) def _convert_parameters(self, all_parameters): @@ -480,22 +556,29 @@ def apply(self, topic_model, one_model_parameter, dictionary=None, model_id=None for (agent_blueprint_template, field_name, current_user_value) in one_model_parameter: agent_blueprint = dict(agent_blueprint_template) - if agent_blueprint["reg_name"] is None: - regularizer = agent_blueprint["regularizer"] - new_regularizer = deepcopy(regularizer) - handle_regularizer( - self._relative, - new_model, - new_regularizer, - self.data_stats, - ) - agent_blueprint["reg_name"] = new_regularizer.name - else: - if agent_blueprint['reg_name'] not in new_model.regularizers.data: + if agent_blueprint.get("reg_name") is not None: + reg_name = agent_blueprint['reg_name'] + + if reg_name not in new_model.all_regularizers: error_msg = (f"Regularizer {agent_blueprint['reg_name']} does not exist. 
" f"Cannot be modified.") raise ValueError(error_msg) + elif agent_blueprint.get("regularizer") is not None: + regularizer = agent_blueprint["regularizer"] + new_regularizer = deepcopy(regularizer) + if isinstance(regularizer, BaseRegularizer): + new_model.custom_regularizers[new_regularizer.name] = new_regularizer + else: # classic bigARTM regularizer, attempt to relativize it's coefficients + handle_regularizer( + self._relative, + new_model, + new_regularizer, + self.data_stats, + ) + agent_blueprint["reg_name"] = new_regularizer.name + else: + raise ValueError("Either 'reg_name' or 'regularizer' should be set") agent_blueprint['local_dict']['user_value'] = current_user_value # ControllerAgent needs only reg_name in constructor agent_blueprint.pop("regularizer") diff --git a/topicnet/cooking_machine/cubes/perplexity_strategy.py b/topicnet/cooking_machine/cubes/perplexity_strategy.py index ed90af3..48bc03d 100644 --- a/topicnet/cooking_machine/cubes/perplexity_strategy.py +++ b/topicnet/cooking_machine/cubes/perplexity_strategy.py @@ -177,7 +177,7 @@ def prepare_grid(self, other_parameters, reg_search="add"): self.grid = product(*all_coeffs_grid) self.grid_len = len(all_coeffs_grid[0]) if self.grid is None: - raise ValueError(f'Failed to initialize self.grid, check initial parameters.') + raise ValueError('Failed to initialize self.grid, check initial parameters.') def grid_visit_generator(self, other_parameters, reg_search): """ diff --git a/topicnet/cooking_machine/cubes/regularizer_cube.py b/topicnet/cooking_machine/cubes/regularizer_cube.py index 0dfa4f0..1137078 100644 --- a/topicnet/cooking_machine/cubes/regularizer_cube.py +++ b/topicnet/cooking_machine/cubes/regularizer_cube.py @@ -1,6 +1,7 @@ from .base_cube import BaseCube from ..routine import transform_complex_entity_to_dict from ..rel_toolbox_lite import count_vocab_size, handle_regularizer +from ..models.base_regularizer import BaseRegularizer from copy import deepcopy @@ -144,18 +145,28 @@ def 
apply(self, topic_model, one_model_parameter, dictionary=None, model_id=None regularizer_type = str(type(regularizer)) if isinstance(regularizer, dict): if regularizer['name'] in new_model.all_regularizers.keys(): + # TODO: do we actually need to deepcopy custom regularizers? new_regularizer = deepcopy(new_model.all_regularizers[regularizer['name']]) - new_regularizer._tau = params - handle_regularizer( - self._relative, - new_model, - new_regularizer, - self.data_stats, - ) + if regularizer['name'] in new_model.custom_regularizers: + new_model.custom_regularizers[regularizer['name']].tau = params + else: + # if this is classic regularizer, we attempt to relativize it's coefficients + new_regularizer._tau = params + handle_regularizer( + self._relative, + new_model, + new_regularizer, + self.data_stats, + ) else: error_msg = (f"Regularizer {regularizer['name']} does not exist. " f"Cannot be modified.") raise ValueError(error_msg) + elif isinstance(regularizer, BaseRegularizer): + # TODO: do we actually need to deepcopy here? 
+ new_regularizer = deepcopy(regularizer) + new_regularizer.tau = params + new_model.custom_regularizers[regularizer.name] = new_regularizer elif 'Regularizer' in regularizer_type: new_regularizer = deepcopy(regularizer) new_regularizer._tau = params diff --git a/topicnet/cooking_machine/dataset.py b/topicnet/cooking_machine/dataset.py index 4d96866..30dfe90 100644 --- a/topicnet/cooking_machine/dataset.py +++ b/topicnet/cooking_machine/dataset.py @@ -3,7 +3,6 @@ import pandas as pd import shutil import sys -import tempfile import warnings from glob import glob @@ -403,6 +402,7 @@ def from_dataframe( """ data_path = os.path.join(save_dataset_path, dataframe_name + '.csv') dataframe.to_csv(data_path) + return cls(data_path=data_path, **kwargs) def get_dataset(self): @@ -558,17 +558,16 @@ def _check_collection(self): return False, path_to_collection if self._data_hash is None: - temp_file_descriptor, temp_file_path = tempfile.mkstemp( - prefix='temp_vw__', - suffix='.txt', - dir=self._internals_folder_path + temp_file_path = os.path.join( + self._internals_folder_path, 'temp_vw.txt' ) - self.write_vw(temp_file_path) - self._data_hash = blake2bchecksum(temp_file_path) - - os.close(temp_file_descriptor) - os.remove(temp_file_path) + try: + self.write_vw(temp_file_path) + self._data_hash = blake2bchecksum(temp_file_path) + finally: + if os.path.isfile(temp_file_path): + os.remove(temp_file_path) if os.path.isfile(path_to_collection): same_collection = blake2bchecksum(path_to_collection) == self._data_hash diff --git a/topicnet/cooking_machine/experiment.py b/topicnet/cooking_machine/experiment.py index eb75bac..65484ad 100644 --- a/topicnet/cooking_machine/experiment.py +++ b/topicnet/cooking_machine/experiment.py @@ -457,7 +457,7 @@ def load(load_path): Experiment """ - from .models import TopicModel + from .models import DummyTopicModel files = os.listdir(load_path) if "params.json" not in files: @@ -472,7 +472,7 @@ def load(load_path): for model_id in 
experiment.models.keys(): if model_id != START: model_save_path = os.path.join(load_path, model_id) - experiment.models[model_id] = TopicModel.load( + experiment.models[model_id] = DummyTopicModel.load( model_save_path, experiment ) @@ -643,7 +643,7 @@ def select(self, query_string='', models_num=None, level=None): return [] - def run(self, dataset, verbose=False, nb_verbose=False): + def run(self, dataset, verbose=False, nb_verbose=False, restore_mode=False): # noqa C901 """ Runs defined pipeline and prints out the result. @@ -664,7 +664,22 @@ def run(self, dataset, verbose=False, nb_verbose=False): continue cube = cube_description['cube'] - cube(stage_models, dataset) + if not restore_mode: + cube(stage_models, dataset) + else: + if cube_index < self.depth - 1: + print(f"[Restoring experiment]: skipping cube {cube_index}") + continue + if cube_index == self.depth - 1: + print( + f"[Restoring experiment]: selecting models at cube number" + f"{cube_index} (some models could be lost)" + ) + if cube_index >= self.depth: + print( + f"[Restoring experiment]: applying cube number {cube_index}" + ) + cube(stage_models, dataset) # TODO: either delete this line completely # or come up with a way to restore any cube using just info about it in self.cubes @@ -803,7 +818,7 @@ def preprocess_query(self, query_string: str, level): req_equal, metric, extremum) = parse_query_string(inner_query_string) if metric is not None or extremum is not None: - warnings.warn(f'You try to optimize model parameters.') + warnings.warn('You try to optimize model parameters.') candidate_tmodels = self.get_models_by_depth(level=level) special_models = choose_best_models( @@ -850,6 +865,9 @@ def build(self, settings): try: self.cubes += [{ 'action': stage_cube.action, + # TODO: should it be 'params': cube_param instead? + # it seems that it is possible to restore failed + # experiment with load() that way..? 
'params': stage_cube.get_jsonable_from_parameters(), 'cube': stage_cube }] diff --git a/topicnet/cooking_machine/models/__init__.py b/topicnet/cooking_machine/models/__init__.py index 20d5755..99cd588 100644 --- a/topicnet/cooking_machine/models/__init__.py +++ b/topicnet/cooking_machine/models/__init__.py @@ -1,6 +1,7 @@ from .base_model import BaseModel from .topic_model import TopicModel from .dummy_topic_model import DummyTopicModel + from .base_score import BaseScore from .example_score import ScoreExample from .intratext_coherence_score import IntratextCoherenceScore diff --git a/topicnet/cooking_machine/models/base_score.py b/topicnet/cooking_machine/models/base_score.py index 1e6ee04..c3b43cb 100644 --- a/topicnet/cooking_machine/models/base_score.py +++ b/topicnet/cooking_machine/models/base_score.py @@ -1,4 +1,11 @@ import dill + +from typing import ( + Any, + Callable, + Dict, +) + from . import scores as tn_scores @@ -7,21 +14,93 @@ class BaseScore: Base Class to construct custom score functions. """ - def __init__(self, name: str = None): # TODO: name should not be optional + _PRECOMPUTED_DATA_PARAMETER_NAME = 'precomputed_data' + + # TODO: name should not be optional + def __init__( + self, + name: str = None, + should_compute: Callable[[int], bool] or bool = None): """ Parameters ---------- - name: + name Name of the score - + should_compute + Function which decides whether the score should be computed + on the current fit iteration or not. + If `should_compute` is `None`, then score is going to be computed on every iteration. + At the same time, whatever function one defines, + score is always computed on the last fit iteration. + This is done for two reasons. + Firstly, so that the score is always computed at least once during `model._fit()`. + Secondly, so that `experiment.select()` works correctly. 
+ + The parameter `should_compute` might be helpful + if the score is slow but one still needs + to get the dependence of the score on iteration + (for the described case, one may compute the score + on every even iteration or somehow else). + However, be aware that if `should_compute` is used for some model's scores, + then the scores may have different number of values in `model.scores`! + Number of score values is the number of times the scores was calculated; + first value corresponds to the first fit iteration + which passed `should_compute` etc. + + There are a couple of things also worth noting. + Fit iteration numbering starts from zero. + And every new `model._fit()` call is a new range of fit iterations. + + Examples + -------- + Scores created below are unworkable (as BaseScore has no `call` method inplemented). + These are just the examples of how one can create a score and set some of its parameters. + + Scores to be computed on every iteration: + + >>> score = BaseScore() + >>> score = BaseScore(should_compute=BaseScore.compute_always) + >>> score = BaseScore(should_compute=lambda i: True) + >>> score = BaseScore(should_compute=True) + + Scores to be computed only on the last iteration: + + >>> score = BaseScore(should_compute=BaseScore.compute_on_last) + >>> score = BaseScore(should_compute=lambda i: False) + >>> score = BaseScore(should_compute=False) + + Score to be computed only on even iterations: + + >>> score = BaseScore(should_compute=lambda i: i % 2 == 0) """ self._name = name + + if should_compute is None: + should_compute = self.compute_always + elif should_compute is True: + should_compute = self.compute_always + elif should_compute is False: + should_compute = self.compute_on_last + elif not isinstance(should_compute, type(lambda: None)): + raise TypeError(f'Unknown type of `should_compute`: {type(should_compute)}!') + else: + pass + + self._should_compute = should_compute self.value = [] if not hasattr(tn_scores, 
self.__class__.__name__): setattr(tn_scores, self.__class__.__name__, self.__class__) + @staticmethod + def compute_always(fit_iteration: int) -> bool: + return True + + @staticmethod + def compute_on_last(fit_iteration: int) -> bool: + return False + def __repr__(self): return f'{self.__class__.__name__}' @@ -57,7 +136,7 @@ def update(self, score): self.value.append(score) - def call(self, model): + def call(self, model, precomputed_data: Dict[str, Any] = None): """ Call to custom score function. @@ -65,6 +144,12 @@ def call(self, model): ---------- model : TopicModel a TopicNet model inherited from BaseModel + precomputed_data + Data which scores may share between each other during *one fit iteration*. + For example, if the model has several scores of the same score class, + and there is a heavy time consuming computation inside this score class, + it may be useful to perform the calculations *only once*, for one score instance, + and then make the result visible for all other scores that might need it. Returns ------- @@ -78,5 +163,44 @@ def call(self, model): and then use this logic in query in Experiment's `select()` method. If one need ARTM model for score (not TopicNet one), it is available as model._model + + When creating a custom score class, + it is recommended to use `**kwargs` in the score's `call` method, + so that all `BaseScore` optional parameters are also available + in its successor score classes. + + Examples + -------- + + Score which uses `precomputed_data`: + + >>> import time + ... + >>> class NewScore(BaseScore): + ... def __init__(self, name: str, multiplier: float): + ... super().__init__(name=name) + ... + ... self._multiplier = multiplier + ... self._heavy_value_name = 'time_consuming_value_name' + ... + ... def call(self, model, precomputed_data = None): + ... if precomputed_data is None: + ... # Parameter `precomputed_data` is optional in BaseScore + ... # So this case also should be supported + ... 
heavy_value = self._compute_heavy(model) + ... elif self._heavy_value_name in precomputed_data: + ... # This is going to be fast + ... heavy_value = precomputed_data[self._heavy_value_name] + ... else: + ... # This is slow (but only one such call!) + ... heavy_value = self._compute_heavy(model) + ... precomputed_data[self._heavy_value_name] = heavy_value + ... + ... return heavy_value * self._multiplier + ... + ... def _compute_heavy(self, model): + ... time.sleep(100) # just for demonstration + ... + ... return 0 """ raise NotImplementedError('Define your score here') diff --git a/topicnet/cooking_machine/models/blei_lafferty_score.py b/topicnet/cooking_machine/models/blei_lafferty_score.py index baf2393..1c6e90b 100644 --- a/topicnet/cooking_machine/models/blei_lafferty_score.py +++ b/topicnet/cooking_machine/models/blei_lafferty_score.py @@ -1,4 +1,7 @@ import numpy as np + +from typing import Callable + from .base_score import BaseScore @@ -11,7 +14,11 @@ class BleiLaffertyScore(BaseScore): to describe given topic. Summing up that score helps to estimate how well the model distinguishes between topics. 
The higher this score - better """ - def __init__(self, name: str = None, num_top_tokens: int = 30): + def __init__( + self, + name: str = None, + num_top_tokens: int = 30, + should_compute: Callable[[int], bool] = None): """ Parameters @@ -22,7 +29,7 @@ def __init__(self, name: str = None, num_top_tokens: int = 30): now many tokens we consider to be """ - super().__init__(name=name) + super().__init__(name=name, should_compute=should_compute) self.num_top_tokens = num_top_tokens @@ -60,7 +67,7 @@ def _compute_blei_scores(self, phi): scores = phi * multiplier return scores - def call(self, model): + def call(self, model, **kwargs): modalities = list(model.class_ids.keys()) score = 0 diff --git a/topicnet/cooking_machine/models/dummy_topic_model.py b/topicnet/cooking_machine/models/dummy_topic_model.py index b00b5d5..37d802d 100644 --- a/topicnet/cooking_machine/models/dummy_topic_model.py +++ b/topicnet/cooking_machine/models/dummy_topic_model.py @@ -371,3 +371,6 @@ def get_theta(self, *args, **kwargs): def add_cube(self, cube): raise InvalidOperationError() + + def describe_regularizers(self): + raise InvalidOperationError() diff --git a/topicnet/cooking_machine/models/example_score.py b/topicnet/cooking_machine/models/example_score.py index 35ff6a8..a19504d 100644 --- a/topicnet/cooking_machine/models/example_score.py +++ b/topicnet/cooking_machine/models/example_score.py @@ -1,4 +1,7 @@ import numpy as np + +from typing import Callable + from .base_score import BaseScore @@ -10,7 +13,11 @@ class ScoreExample(BaseScore): (the internal logic of TopicNet relies on them) """ - def __init__(self, name: str = None, token_threshold: float = 1e-3): + def __init__( + self, + name: str = None, + token_threshold: float = 1e-3, + should_compute: Callable[[int], bool] = None): """ Parameters @@ -21,11 +28,11 @@ def __init__(self, name: str = None, token_threshold: float = 1e-3): what probabilities to take as token belonging to the topic """ - super().__init__(name=name) + 
super().__init__(name=name, should_compute=should_compute) self.threshold = token_threshold - def call(self, model): + def call(self, model, **kwargs): """ Method that calculates the score diff --git a/topicnet/cooking_machine/models/intratext_coherence_score.py b/topicnet/cooking_machine/models/intratext_coherence_score.py index 7aa22df..3e68969 100644 --- a/topicnet/cooking_machine/models/intratext_coherence_score.py +++ b/topicnet/cooking_machine/models/intratext_coherence_score.py @@ -8,6 +8,7 @@ from collections import defaultdict from enum import Enum, IntEnum, auto from typing import ( + Callable, Dict, List, Optional, @@ -105,6 +106,7 @@ def __init__( # noqa: C901 self, dataset: Union[Dataset, str], name: str = None, + should_compute: Callable[[int], bool] = None, keep_dataset_in_memory: bool = None, keep_dataset: bool = True, documents: List[str] = None, @@ -189,7 +191,7 @@ def __init__( # noqa: C901 >>> topic_model._fit(dataset.get_batch_vectorizer(), num_iterations=num_iterations) """ # TODO: word_topic_relatedness seems to be connected with TopTokensViewer stuff - super().__init__(name=name) + super().__init__(name=name, should_compute=should_compute) self._keep_dataset = keep_dataset @@ -363,7 +365,7 @@ def load(cls, path: str): return score - def call(self, model: BaseModel) -> float: + def call(self, model: BaseModel, **kwargs) -> float: if (self._current_iteration - self._start_fit_iteration) % self._fit_iteration_step != 0: self._current_iteration += 1 diff --git a/topicnet/cooking_machine/models/scores_wrapper.py b/topicnet/cooking_machine/models/scores_wrapper.py index fe0c7ac..4796769 100644 --- a/topicnet/cooking_machine/models/scores_wrapper.py +++ b/topicnet/cooking_machine/models/scores_wrapper.py @@ -58,9 +58,9 @@ def add(self, score: Union[BaseScore, artm.scores.BaseScore]): elif isinstance(score, BaseScore): if score._name is None: raise ValueError( - f'When using `model.scores.add(score)` method,' - f' one should specify score name 
parameter during score initialization.' - f' For example `model.scores.add(IntratextCoherenceScore(name="name", ...))' + 'When using `model.scores.add(score)` method,' + ' one should specify score name parameter during score initialization.' + ' For example `model.scores.add(IntratextCoherenceScore(name="name", ...))' ) self._topicnet_scores[score._name] = score diff --git a/topicnet/cooking_machine/models/thetaless_regularizer.py b/topicnet/cooking_machine/models/thetaless_regularizer.py index a1218ec..1903b1a 100644 --- a/topicnet/cooking_machine/models/thetaless_regularizer.py +++ b/topicnet/cooking_machine/models/thetaless_regularizer.py @@ -1,13 +1,161 @@ import numpy as np -from numba import jit +import os +import pandas as pd import scipy.sparse +import warnings + +from numba import jit + +import artm from .base_regularizer import BaseRegularizer +from ..dataset import Dataset + + +# TODO: move this to BigARTM +# ================================== + +FIELDS = 'token class_id token_value token_tf token_df'.split() + + +def artm_dict2df(artm_dict): + """ + :Description: converts the BigARTM dictionary of the collection + to the pandas.DataFrame. + This is approximately equivalent to the dictionary.save_text() + but has no I/O overhead + + """ + dictionary_data = artm_dict._master.get_dictionary(artm_dict._name) + dict_pandas = {field: getattr(dictionary_data, field) + for field in FIELDS} + return pd.DataFrame(dict_pandas) + +# ================================== EPS = 1e-20 +# TODO: is there a better way to do this? 
+def obtain_token2id(dataset: Dataset): + """ + Allows one to obtain the mapping from token to the artm.dictionary id of that token + (useful for low-level operations such as reading batches manually) + + Returns + ------- + dict: + maps (token, class_id) to integer (corresponding to the row of Phi / dictionary id) + + """ + df = artm_dict2df(dataset.get_dictionary()) + df_inverted_index = df[['token', 'class_id']].reset_index().set_index(['token', 'class_id']) + + return df_inverted_index.to_dict()['index'] + + +def dataset2sparse_matrix(dataset, modality, modalities_to_use=None): + """ + Builds a sparse matrix from batch_vectorizer linked to the Dataset + + If you need an inverse mapping: + + >>> d = sparse_n_dw_matrix.todok() # convert to dictionary of keys format + >>> dict_of_csr = dict(d.items()) + + Parameters + ---------- + dataset: Dataset + modality: str + the remaining modalities will be ignored + (their occurrences will be replaced with zeros, but they will continue to exist) + modalities_to_use: iterable + a set of modalities the underlying topic model is using (this is about topic model, + not regularizer; this parameter ensures that the shapes of n_dw matrix and actual + Phi matrix match). + + The tokens outside of this list will be discarded utterly + (the resulting matrix will have no entries corresponding to them) + + For artm.ARTM() models, you need to pass whatever is inside class_ids; + while TopicModel usually requires this to be set inside modalities_to_use. + + If you hadn't explicitly listed any modalities yet, you probably could + leave this argument as None. + + If you use a single modality, wrap it into a list (e.g.['@word']) + + Returns + ------- + n_dw_matrix: scipy.sparse.csr_matrix + The matrix of document-word occurrences. 
+ `n_dw` is a number of the occurrences of the word `w` in the document `d` + this matrix determines the dependence between the Theta and Phi matrices + (Phi is the result of one iteration of the ARTM's EM algorihtm + with uniform theta initialization and `n_dw` matrix of the document-word occurrences) + """ # noqa: W291 + token2id = obtain_token2id(dataset) + + batch_vectorizer = dataset.get_batch_vectorizer() + + return _batch_vectorizer2sparse_matrix( + batch_vectorizer, token2id, modality, modalities_to_use + ) + + +def _batch_vectorizer2sparse_matrix(batch_vectorizer, token2id, modality, modalities_to_use=None): + """ + """ + theta_column_naming = 'id' # scipy sparse matrix doesn't support non-integer indices + matrix_row, matrix_col, matrix_data = [], [], [] + + for batch_id in range(len(batch_vectorizer._batches_list)): + batch_name = batch_vectorizer._batches_list[batch_id]._filename + batch = artm.messages.Batch() + with open(batch_name, "rb") as f: + batch.ParseFromString(f.read()) + + for item_id in range(len(batch.item)): + item = batch.item[item_id] + theta_item_id = getattr(item, theta_column_naming) + + for local_token_id, token_weight in zip(item.token_id, item.token_weight): + token_class_id = batch.class_id[local_token_id] + token = batch.token[local_token_id] + if (token, token_class_id) not in token2id: + # probably dictionary was filtered + continue + if modalities_to_use and token_class_id not in modalities_to_use: + continue + if token_class_id != modality: + # we still need these tokens, + # shapes of n_dw matrix and actual Phi matrix should be in sync. + # this will be changed to zero at the end + token_weight = np.nan + token_id = token2id[(token, token_class_id)] + matrix_row.append(theta_item_id) + matrix_col.append(token_id) + matrix_data.append(token_weight) + + sparse_n_dw_matrix = scipy.sparse.csr_matrix( + (matrix_data, (matrix_row, matrix_col)), + ) + # remove the columns whose all elements are zero + # (i.e. 
tokens which are of different modalities) + # and renumber index (fill any "holes") + # this is needed to be in sync with artm dictionary after filtering elements out + # (they need to have the same shape) + ind = sparse_n_dw_matrix.sum(axis=0) + nonzeros = np.ravel(ind > 0) + sparse_n_dw_matrix = sparse_n_dw_matrix[:, nonzeros] + + # re-encode values to transform NaNs to explicitly stored zeros + sparse_n_dw_matrix.data = np.nan_to_num(sparse_n_dw_matrix.data) + + return sparse_n_dw_matrix + + @jit(nopython=True) def memory_efficient_inner1d(fst_arr, fst_indices, snd_arr, snd_indices): """ @@ -92,6 +240,7 @@ def get_prob_matrix_by_counters(counters, inplace=False): # set rows where sum of row is small to uniform res[np.sum(res, axis=1) < EPS, :] = 1. res /= np.sum(res, axis=1)[:, np.newaxis] + return res @@ -113,40 +262,63 @@ def calc_A_matrix( class ThetalessRegularizer(BaseRegularizer): - def __init__(self, name, tau, n_dw_matrix): + def __init__(self, name, tau, modality, dataset: Dataset): """ - Creates a node in the graph with the given args and kwargs. + A regularizer based on a "thetaless" topic model inference + + Note: this implementation stores sparse `n_dw` matrix in memory, + so this is not particularly memory- and space-efficient for huge datasets Parameters ---------- name: str name of the regularizer tau: Number - fictive parameter it's not used, just passed to the parent conctructor - n_dw_matrix: scipy.sparse.csr_matrix - The matrix of document-word occurrences - n_dw is a number of the occurrences of the word w in the document d - this matrix determines the dependence between the Theta and Phi matrices - (Phi is the result of one iteration of the ARTM's EM algorihtm - with uniform theta initialization and n_dw matrix of the document-word occurrences) - """ + according to the math, `tau` should be set to 1 (to correctly emulate a different + inference process). But you do you, it's not like there's a regularizer + police or something. 
+ modality: str + name of modality on which the inference should be based + dataset + will be transformed to n_dw_matrix + """ # noqa: W291 super().__init__(name, tau) - self.n_dw_matrix = n_dw_matrix + + self.modality = modality + self.modalities_to_use = None + self.n_dw_matrix = None + + self.token2id = obtain_token2id(dataset) + self._batches_path = os.path.join(dataset._internals_folder_path, "batches") + + def _initialize_matrices(self, batch_vectorizer, token2id): + self.n_dw_matrix = _batch_vectorizer2sparse_matrix( + batch_vectorizer, token2id, self.modality, self.modalities_to_use + ) self.B = scipy.sparse.csr_matrix( ( - 1. * n_dw_matrix.data / calc_docsizes(n_dw_matrix), - n_dw_matrix.indices, - n_dw_matrix.indptr + 1. * self.n_dw_matrix.data / calc_docsizes(self.n_dw_matrix), + self.n_dw_matrix.indices, + self.n_dw_matrix.indptr ), - shape=n_dw_matrix.shape + shape=self.n_dw_matrix.shape ).tocsc() - self.docptr = get_docptr(n_dw_matrix) - self.wordptr = n_dw_matrix.indices + self.docptr = get_docptr(self.n_dw_matrix) + self.wordptr = self.n_dw_matrix.indices def grad(self, pwt, nwt): phi_matrix_tr = np.array(pwt) phi_matrix = phi_matrix_tr.T phi_rev_matrix = get_prob_matrix_by_counters(phi_matrix_tr) + + if self.n_dw_matrix.shape[1] != phi_rev_matrix.shape[0]: + raise ValueError( + f"Thetaless regularizer has prepared {self.n_dw_matrix.shape} n_dw matrix," + f" but was passed {phi_rev_matrix.T.shape} Phi matrix containing different" + f" number of tokens ({self.n_dw_matrix.shape[1]} != {phi_rev_matrix.shape[0]})" + f"\n(Are modalities the same?)" + ) + theta_matrix = get_prob_matrix_by_counters( self.n_dw_matrix.dot(phi_rev_matrix) ) @@ -164,4 +336,24 @@ def grad(self, pwt, nwt): tmp = g_dt.T * self.B / (phi_matrix_tr.sum(axis=1) + EPS) n_tw += (tmp - np.einsum('ij,ji->i', phi_rev_matrix, tmp)) * phi_matrix - return n_tw.T - nwt + return self.tau * (n_tw.T - nwt) + + def attach(self, model): + """ + + Parameters + ---------- + model : ARTM model + 
necessary to apply master component + """ + if model.num_document_passes != 1: + warnings.warn( + f"num_document_passes is equal to {model.num_document_passes}, but it" + f" should be set to {1} to correctly emulate a thetaless inference process" + ) + + self.modalities_to_use = model.class_ids.keys() + bv = artm.BatchVectorizer(data_path=self._batches_path, data_format='batches') + self._initialize_matrices(bv, self.token2id) + + self._model = model diff --git a/topicnet/cooking_machine/models/topic_model.py b/topicnet/cooking_machine/models/topic_model.py index 3dfd57e..8e397d1 100644 --- a/topicnet/cooking_machine/models/topic_model.py +++ b/topicnet/cooking_machine/models/topic_model.py @@ -9,7 +9,6 @@ import shutil import warnings -from artm.wrapper.exceptions import ArtmException from copy import deepcopy from inspect import signature from numbers import Number @@ -18,10 +17,14 @@ Any, Dict, List, + Union, ) +from artm.wrapper.exceptions import ArtmException + from . import scores as tn_scores from .base_model import BaseModel +from .base_regularizer import BaseRegularizer from .base_score import BaseScore from .frozen_score import FrozenScore from ..cubes.controller_cube import ControllerAgent @@ -59,7 +62,7 @@ def __init__( experiment=None, callbacks: List[ControllerAgent] = None, custom_scores: Dict[str, BaseScore] = None, - custom_regularizers: Dict[str, artm.regularizers.BaseRegularizer] = None, + custom_regularizers: Dict[str, BaseRegularizer] = None, *args, **kwargs): """ Initialize stage, also used for loading previously saved experiments. 
@@ -176,24 +179,13 @@ def get_score_properties_and_values(score_name, score_object): return score_values - def _fit(self, dataset_trainable, num_iterations, custom_regularizers=None): - """ - - Parameters - ---------- - dataset_trainable : BatchVectorizer - Data for model fit - num_iterations : int - Amount of fit steps - custom_regularizers : dict of BaseRegularizer - Regularizers to apply to model - - """ + def _prepare_custom_regularizers(self, custom_regularizers): if custom_regularizers is None: custom_regularizers = dict() all_custom_regularizers = deepcopy(custom_regularizers) all_custom_regularizers.update(self.custom_regularizers) + base_regularizers_name, base_regularizers_tau = None, None if len(all_custom_regularizers) != 0: for regularizer in all_custom_regularizers.values(): @@ -204,7 +196,29 @@ def _fit(self, dataset_trainable, num_iterations, custom_regularizers=None): base_regularizers_tau = [regularizer.tau for regularizer in self._model.regularizers.data.values()] + return base_regularizers_name, base_regularizers_tau, all_custom_regularizers + + def _fit(self, dataset_trainable, num_iterations, custom_regularizers=None): + """ + + Parameters + ---------- + dataset_trainable : BatchVectorizer + Data for model fit + num_iterations : int + Amount of fit steps + custom_regularizers : dict of BaseRegularizer + Regularizers to apply to model + + """ + (base_regularizers_name, + base_regularizers_tau, + all_custom_regularizers) = self._prepare_custom_regularizers(custom_regularizers) + for cur_iter in range(num_iterations): + precomputed_data = dict() + iter_is_last = cur_iter == num_iterations - 1 + self._model.fit_offline(batch_vectorizer=dataset_trainable, num_collection_passes=1) @@ -216,9 +230,26 @@ def _fit(self, dataset_trainable, num_iterations, custom_regularizers=None): for name, custom_score in self.custom_scores.items(): try: - score = custom_score.call(self) + should_compute_now = iter_is_last or custom_score._should_compute(cur_iter) + + 
if not should_compute_now: + continue + + # TODO: this check is probably should be refined somehow... + # what if some new parameter added to BaseScore.call -> new check?.. + call_parameters = signature(custom_score.call).parameters + + # if-else instead of try-catch: to speed up + if (BaseScore._PRECOMPUTED_DATA_PARAMETER_NAME not in call_parameters + and not any(str(p).startswith('**') for p in call_parameters.values())): + + score = custom_score.call(self) + else: + score = custom_score.call(self, precomputed_data=precomputed_data) + custom_score.update(score) self._model.score_tracker[name] = custom_score + except AttributeError: # TODO: means no "call" attribute? raise AttributeError(f'Score {name} doesn\'t have a desired attribute') @@ -316,17 +347,32 @@ def save_custom_regularizers(self, model_save_path=None): model_save_path = self.model_default_save_path for regularizer_name, regularizer_object in self.custom_regularizers.items(): - try: - save_path = os.path.join(model_save_path, regularizer_name + '.rd') - with open(save_path, 'wb') as reg_f: - dill.dump(regularizer_object, reg_f) - except (TypeError, AttributeError): + # If not do this, there may be problems with pickling: + # `model` is an ARTM-C-like thing, and it may cause problems + # This is safe, because `model` appears in attach(), + # which is called before each iteration + # P.S. 
and the `model` itself may be needed for a regularizer inside `grad()` + regularizer_object._model = None + + managed_to_pickle = False + + for (pickler, extension) in zip([dill, pickle], ['.rd', '.rp']): + save_path = os.path.join(model_save_path, regularizer_name + extension) + try: - save_path = os.path.join(model_save_path, regularizer_name + '.rp') with open(save_path, 'wb') as reg_f: - pickle.dump(regularizer_object, reg_f) + pickler.dump(regularizer_object, reg_f) except (TypeError, AttributeError): - warnings.warn(f'Cannot save {regularizer_name} regularizer.') + if os.path.isfile(save_path): + os.remove(save_path) + else: + managed_to_pickle = True + + if managed_to_pickle: + break + + if not managed_to_pickle: + warnings.warn(f'Cannot save {regularizer_name} regularizer!') def save(self, model_save_path=None, @@ -515,6 +561,7 @@ def get_phi(self, topic_names=None, class_ids=None, model_name=None): class_ids = [class_ids] class_ids_iter = class_ids or self._model.class_ids # TODO: this workaround seems to be a correct solution to this problem + # maybe the next for-loop could be replaced with these three lines if not class_ids_iter: valid_model_name = self._model.model_pwt info = self._model.master.get_phi_info(valid_model_name) @@ -829,3 +876,25 @@ def describe_regularizers(self): columns=["model_id", "regularizer_name", "tau", "gamma", "class_ids"], data=data ) return result.set_index(["model_id", "regularizer_name"]).sort_values(by="regularizer_name") + + def get_regularizer( + self, reg_name: str) -> Union[BaseRegularizer, artm.regularizers.BaseRegularizer]: + """ + Retrieves the regularizer specified, no matter is it custom or "classic" + + Returns + ------- + regularizer + + """ + # TODO: RegularizersWrapper? 
+ + if reg_name in self.custom_regularizers: + return self.custom_regularizers[reg_name] + elif reg_name in self._model.regularizers.data: + return self._model.regularizers.data[reg_name] + else: + raise KeyError( + f'There is no such regularizer "{reg_name}"' + f' among custom and ARTM regularizers!' + ) diff --git a/topicnet/cooking_machine/recipes/artm_baseline_pipeline.py b/topicnet/cooking_machine/recipes/artm_baseline_pipeline.py index edee710..25477df 100644 --- a/topicnet/cooking_machine/recipes/artm_baseline_pipeline.py +++ b/topicnet/cooking_machine/recipes/artm_baseline_pipeline.py @@ -1,7 +1,9 @@ from typing import List + from .recipe_wrapper import BaseRecipe from .. import Dataset + ARTM_baseline_template = ''' # This config follows a strategy described by Murat Apishev # one of the core programmers of BigARTM library in personal correspondence. @@ -40,6 +42,7 @@ num_top_tokens: 30 model: dataset_path: {dataset_path} + {dictionary_filter_parameters} modalities_to_use: {modality_list} main_modality: '{main_modality}' @@ -62,6 +65,8 @@ use_relative_coefficients: true ''' +ONE_CONFIG_INDENT = 4 * ' ' + class BaselineRecipe(BaseRecipe): """ @@ -74,6 +79,7 @@ def __init__(self): def format_recipe( self, dataset_path: str, + dictionary_filter_parameters: dict = None, modality_list: List[str] = None, topic_number: int = 20, background_topic_number: int = 1, @@ -86,11 +92,21 @@ def format_recipe( background_topics = [f'bcg_{i}' for i in range( len(specific_topics), len(specific_topics) + background_topic_number)] + if dictionary_filter_parameters is None: + dictionary_filter_parameters = dict() + + dictionary_filter_parameters_as_yml = self._format_dictionary_filter_parameters( + dictionary_filter_parameters, + indent=2 * ONE_CONFIG_INDENT, + ) + self._recipe = self.recipe_template.format( dataset_path=dataset_path, + dictionary_filter_parameters=dictionary_filter_parameters_as_yml, modality_list=modality_list, main_modality=modality_list[0], 
specific_topics=specific_topics, background_topics=background_topics, ) + return self._recipe diff --git a/topicnet/cooking_machine/recipes/exploratory_search_pipeline.py b/topicnet/cooking_machine/recipes/exploratory_search_pipeline.py index f0be531..ea6372e 100644 --- a/topicnet/cooking_machine/recipes/exploratory_search_pipeline.py +++ b/topicnet/cooking_machine/recipes/exploratory_search_pipeline.py @@ -20,6 +20,10 @@ # specific_topics=specific_topics, background_topics=background_topics) # when loading the recipe to adjust for your dataset +# If you have more than one modaity you want to use, we recommend employing +# more advanced MultimodalSearchRecipe from multimodal_exploratory_search_pipeline instead + + topics: # Describes number of model topics, in the actuall article 200 topics were found to be optimal specific_topics: {{specific_topics}} diff --git a/topicnet/cooking_machine/recipes/intratext_coherence_maximization.yml b/topicnet/cooking_machine/recipes/intratext_coherence_maximization.yml index 72244bf..d134629 100644 --- a/topicnet/cooking_machine/recipes/intratext_coherence_maximization.yml +++ b/topicnet/cooking_machine/recipes/intratext_coherence_maximization.yml @@ -67,6 +67,7 @@ scores: model: dataset_path: {dataset_path} + {dictionary_filter_parameters} modalities_to_use: {modality_names} main_modality: '{main_modality}' diff --git a/topicnet/cooking_machine/recipes/intratext_coherence_pipeline.py b/topicnet/cooking_machine/recipes/intratext_coherence_pipeline.py index 2c712a3..b89afe6 100644 --- a/topicnet/cooking_machine/recipes/intratext_coherence_pipeline.py +++ b/topicnet/cooking_machine/recipes/intratext_coherence_pipeline.py @@ -6,6 +6,8 @@ from .recipe_wrapper import BaseRecipe from .. 
import Dataset +ONE_CONFIG_INDENT = 4 * ' ' + class IntratextCoherenceRecipe(BaseRecipe): """ @@ -35,6 +37,7 @@ def format_recipe( dataset_path: str, num_specific_topics: int, main_modality: str = None, + dictionary_filter_parameters: dict = None, num_background_topics: int = 1, modalities: List[str] = None, keep_dataset_in_memory: bool = True, @@ -118,10 +121,19 @@ def format_recipe( for i in range(num_specific_topics, num_specific_topics + num_background_topics) ] + if dictionary_filter_parameters is None: + dictionary_filter_parameters = dict() + + dictionary_filter_parameters_as_yml = self._format_dictionary_filter_parameters( + dictionary_filter_parameters, + indent=2 * ONE_CONFIG_INDENT, + ) + self._recipe = self.recipe_template.format( modality_names=modalities, main_modality=main_modality, dataset_path=dataset_path, + dictionary_filter_parameters=dictionary_filter_parameters_as_yml, keep_dataset_in_memory=keep_dataset_in_memory, keep_dataset=keep_dataset, documents_fraction=documents_fraction, diff --git a/topicnet/cooking_machine/recipes/multimodal_exploratory_search_pipeline.py b/topicnet/cooking_machine/recipes/multimodal_exploratory_search_pipeline.py index 6f97a3d..c0a304d 100644 --- a/topicnet/cooking_machine/recipes/multimodal_exploratory_search_pipeline.py +++ b/topicnet/cooking_machine/recipes/multimodal_exploratory_search_pipeline.py @@ -1,4 +1,4 @@ -from typing import List, Union +from typing import List, Union, Dict from .recipe_wrapper import BaseRecipe from .. import Dataset @@ -27,7 +27,7 @@ model: dataset_path: {dataset_path} - modalities_to_use: {modality_list} + {modalities_description} main_modality: '{modality}' stages: @@ -67,7 +67,7 @@ verbose: false use_relative_coefficients: True '''.format('PerplexityScore@all < 1.01 * MINIMUM(PerplexityScore@all)' + - ' and SparsityPhiScore{modality} -> max') + ' and SparsityThetaScore -> max') # Had to change tracked score function. Is it fine? 
decor_phi_cube_template = ''' @@ -80,9 +80,9 @@ - {0} strategy: PerplexityStrategy strategy_params: - start_point: 0 - step: 0.02 - max_len: 20 + start_point: 0.005 + step: 0.005 + max_len: 10 tracked_score_function: PerplexityScore{{modality}} verbose: false use_relative_coefficients: True @@ -134,11 +134,38 @@ def __init__(self, order='extended_modalities'): def format_recipe( self, dataset_path: str, - modality_list: List[str] = None, + modality_list: List[str] or Dict = None, + main_modality: str = None, topic_number: int = 20, background_topic_number: int = 1, num_iter: Union[int, List[int]] = 20, ): + ''' + Creates a recipe for multimodal search + using basic template at the top of this file + + Parameters + ---------- + dataset_path : path to the data + main_modality : str + chosen to be main modality from modality list, if possible + if it is not specified, the function attempts to user + the first entry of `modality_list` instead + + modality_list : list of modality names to use + or a dict specifying the (relative) weight of each + topic_number: + number of the model topics + background_topic_number : + number of background topics + num_iter : + specifying number of iterations for each cube + + Returns + ------- + string specifying recipe for multimodal search + ''' + if modality_list is None: modality_list = list(Dataset(dataset_path).get_possible_modalities()) @@ -146,8 +173,13 @@ def format_recipe( background_topics = [f'bcg_{i}' for i in range( len(specific_topics), len(specific_topics) + background_topic_number)] + if main_modality is None: + if isinstance(modality_list, list): + main_modality = modality_list[0] + else: + raise TypeError("main_modality should be specified") self._make_multimodal_recipe( - modality=modality_list[0], + modality=main_modality, dataset_path=dataset_path, specific_topics=specific_topics, background_topics=background_topics, @@ -211,14 +243,12 @@ def _form_and_order_cubes( num_iter=iterations)) 
cube_templates.append(smooth_phi_cube_template.format(modality=modality, num_iter=iterations)) - cube_templates.append(sparse_theta_cube_template.format(modality=modality, - num_iter=iterations)) + cube_templates.append(sparse_theta_cube_template.format(num_iter=iterations)) else: raise ValueError('That option is not availiable') if self._order == 'extended_modalities': iterations = num_iter[-1] - cube_templates.append(sparse_theta_cube_template.format(modality=modality_list[0], - num_iter=iterations)) + cube_templates.append(sparse_theta_cube_template.format(num_iter=iterations)) return ''.join(cube_templates) def _make_multimodal_recipe( @@ -227,40 +257,30 @@ def _make_multimodal_recipe( modality: str, specific_topics: List[str], background_topics: List[str], - modality_list: List[str] = None, + modality_list: List[str] or Dict = None, background_topic_number: int = 1, num_iter: Union[int, List[int]] = 20, ): - ''' - Creates a recipe for multimodal search - using basic template at the top of this file - - Parameters - ---------- - dataset_path : path to the data - modality : str - chosen to be main modality from modality list - modality_list : list of modality names to use - specific_topics : list of str - names of the model topics - background_topics : list of background topic names - num_iter : number or list of numbers - specifying number of iterations for each cube - - Returns - ------- - string specifying recipe for multimodal search - ''' - reg_forms = self._form_regularizers(modality_list) cube_forms = self._form_and_order_cubes( modality_list, num_iter=num_iter,) + if isinstance(modality_list, list): + modalities_description = f"modalities_to_use: {modality_list}" + elif isinstance(modality_list, dict): + # this line has correct whitespace count + header_string = "modalities_weights:" + # these ones should be indented one level more, so 8 spaces + data_strings = [f"'{k}': {v}" for k, v in modality_list.items()] + strings = [header_string] + data_strings + 
modalities_description = "\n ".join(strings) + else: + raise TypeError("modality_list should be either list or dict, not {type(modality_list}") self._recipe = self.recipe_template.format( modality=modality, dataset_path=dataset_path, specific_topics=specific_topics, background_topics=background_topics, - modality_list=modality_list, + modalities_description=modalities_description, syntesized_regularizers=reg_forms, syntesized_stages=cube_forms) diff --git a/topicnet/cooking_machine/recipes/recipe_wrapper.py b/topicnet/cooking_machine/recipes/recipe_wrapper.py index 0438f87..15d9303 100644 --- a/topicnet/cooking_machine/recipes/recipe_wrapper.py +++ b/topicnet/cooking_machine/recipes/recipe_wrapper.py @@ -1,8 +1,15 @@ -from typing import Tuple +from typing import ( + Dict, + Tuple, + Union, +) from .. import Dataset from .. import Experiment -from ..config_parser import build_experiment_environment_from_yaml_config +from ..config_parser import ( + build_experiment_environment_from_yaml_config, + KEY_DICTIONARY_FILTER_PARAMETERS, +) recipe_template_example = """ @@ -30,19 +37,18 @@ def __str__(self): def format_recipe(self, *args, **kwargs) -> str: """ - Updates self._recipe variable - with variables specific for the dataset + Updates `self._recipe` + with variables specific for the dataset. 
""" raise NotImplementedError( 'Method needs to be specified for the recipe template' ) - return self._recipe def build_experiment_environment( self, save_path: str, experiment_id: str = 'default_experiment_name', - force_separate_thread: bool = False + force_separate_thread: bool = False, ) -> Tuple[Experiment, Dataset]: """ Returns experiment and dataset instances @@ -51,11 +57,13 @@ def build_experiment_environment( Parameters ---------- - save_path: path to the folder to save experiment logs and models - experiment_id: name of the experiment folder - force_separate_thread: train each model in dedicated process + save_path + path to the folder to save experiment logs and models + experiment_id + name of the experiment folder + force_separate_thread + train each model in dedicated process; this feature helps to handle resources in Jupyter notebooks - """ if self._recipe is None: raise ValueError( @@ -66,5 +74,27 @@ def build_experiment_environment( self._recipe, save_path=save_path, experiment_id=experiment_id, - force_separate_thread=force_separate_thread + force_separate_thread=force_separate_thread, + ) + + @staticmethod + def _format_dictionary_filter_parameters( + parameters: Dict[Union[int, float, str, bool], Union[int, float, str, bool]], + indent: str) -> str: + + blank_dictionary = '{}' + + if len(parameters) == 0: + parameters_block = blank_dictionary + else: + parameters_block = '\n'.join([ + f'{indent}{k}: {v}' + for k, v in parameters.items() + ]) + + return ( + KEY_DICTIONARY_FILTER_PARAMETERS + + ':' + + ('\n' if parameters_block != blank_dictionary else ' ') + + parameters_block ) diff --git a/topicnet/cooking_machine/routine.py b/topicnet/cooking_machine/routine.py index 9160108..3a69035 100644 --- a/topicnet/cooking_machine/routine.py +++ b/topicnet/cooking_machine/routine.py @@ -1,11 +1,14 @@ -import numpy as np +import glob import hashlib import json +import numexpr as ne +import numpy as np +import os import re import warnings + from datetime 
import datetime from statistics import mean, median -import numexpr as ne W_TOO_STRICT = 'No models match criteria ' @@ -441,7 +444,7 @@ def choose_value_for_models_num_and_check( models_num = models_num_from_query if models_num is not None and int(models_num) < 0: - raise ValueError(f"Cannot return negative number of models") + raise ValueError("Cannot return negative number of models") return models_num @@ -665,3 +668,26 @@ def blake2bchecksum(file_path): break m.update(data) return m.hexdigest() + + +def load_models_from_disk(experiment_directory, base_experiment_name): + """ + Is useful for restoring failed experiment + """ + from topicnet.cooking_machine.experiment import START + from topicnet.cooking_machine.models import DummyTopicModel + + result_models = [] + + mask = f"{experiment_directory}/{base_experiment_name}_*" + msg = (f'Trying to load models from {mask}.' + f' {len(glob.glob(mask))} models found.') + print(msg) + for folder in glob.glob(mask): + model_pathes = [ + f.path for f in os.scandir(folder) + if f.is_dir() and f.name != START + ] + result_models += [DummyTopicModel.load(path) for path in model_pathes] + + return result_models diff --git a/topicnet/dataset_manager/api.py b/topicnet/dataset_manager/api.py index 519e984..9c72d40 100644 --- a/topicnet/dataset_manager/api.py +++ b/topicnet/dataset_manager/api.py @@ -17,7 +17,7 @@ from ..cooking_machine.dataset import Dataset -_SERVER_URL = 'https://93.175.29.159:8085' +_SERVER_URL = 'https://topicnet-datasets.machine-intelligence.ru' _ARCHIVE_EXTENSION = '.gz' _DEFAULT_DATASET_FILE_EXTENSION = '.csv' @@ -68,12 +68,14 @@ def load_dataset(dataset_name: str, **kwargs) -> Dataset: dataset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), dataset_name) try: + print(f'Checking if dataset "{dataset_name}" was already downloaded before') + saved_dataset = _init_dataset_if_downloaded(dataset_path, **kwargs) except FileNotFoundError: - pass + print(f'Dataset "{dataset_name}" not found on 
the machine') else: print( - f'Dataset already downloaded!' + f'Dataset is found on the machine.' f' Save path is: "{saved_dataset._data_path}"' ) @@ -87,6 +89,8 @@ def load_dataset(dataset_name: str, **kwargs) -> Dataset: print(f'Downloading the "{dataset_name}" dataset...') + save_path = None + try: with urlopen(req, data=data, context=context) as answer: total_size = int(answer.headers.get('content-length', 0)) @@ -109,7 +113,7 @@ def load_dataset(dataset_name: str, **kwargs) -> Dataset: if total_size != 0 and t.n != total_size: raise RuntimeError( - "Failed to download dataset!" + "Failed to download the dataset!" " Some data was lost during network transfer" ) @@ -122,19 +126,19 @@ def load_dataset(dataset_name: str, **kwargs) -> Dataset: return Dataset(save_path, **kwargs) except Exception as exception: - if os.path.isfile(save_path): + if save_path is not None and os.path.isfile(save_path): os.remove(save_path) raise exception finally: - if os.path.isfile(save_path + _ARCHIVE_EXTENSION): + if save_path is not None and os.path.isfile(save_path + _ARCHIVE_EXTENSION): os.remove(save_path + _ARCHIVE_EXTENSION) def _init_dataset_if_downloaded(dataset_path: str, **kwargs) -> Dataset: saved_dataset_path_candidates = [ - p for p in glob(dataset_path + '*') + p for p in glob(dataset_path + '.*') if os.path.isfile(p) and not p.endswith(_ARCHIVE_EXTENSION) ] dataset = None diff --git a/topicnet/demos/README.md b/topicnet/demos/README.md index 0369a4d..a33f51a 100644 --- a/topicnet/demos/README.md +++ b/topicnet/demos/README.md @@ -30,3 +30,4 @@ This section provides demonstrations of how to use this library in NLP tasks. ---- P.S. All the guides are supposed to contain **working** examples of the library code. If you happen to find code that is no longer works, please write about it in the library issues. +We will try to resolve it as soon as possible and plan to include fixes in the nearest releases. 
diff --git a/topicnet/demos/Topic-Thetaless-Regularizer.ipynb b/topicnet/demos/Topic-Thetaless-Regularizer.ipynb new file mode 100644 index 0000000..002c559 --- /dev/null +++ b/topicnet/demos/Topic-Thetaless-Regularizer.ipynb @@ -0,0 +1,1290 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Topic modeling without Theta distribution\n", + "\n", + "Based on unpublished (yet) research by Ilya Irhin, Victor Bulatov, Konstantin Vorontsov\n", + "\n", + "One of the interpretations of topic modeling is this:\n", + "\n", + "\\begin{equation}\n", + "L(\\Phi, \\Theta) + R(\\Phi, \\Theta) \\to \\max_{\\Phi, \\Theta},\n", + "\\end{equation}\n", + "\n", + "Once the inference is directly formulated as an optimization problem, one can ask: is it possible to reduce the number of inferred parameters? \n", + "\n", + "In practice, word-topic matrix $(\\Phi)$ has a larger importance than document-topic matrix $(\\Theta)$; so we can replace the original optimization problem with a different one:\n", + "\n", + "\\begin{equation}\n", + "L(\\Phi, f(\\Phi) ) + R(\\Phi, f(\\Phi) ) \\to \\max_{\\Phi},\n", + "\\end{equation}\n", + "\n", + "Some math later, it turns out that the \"formally correct\" optimization algorithm for this problem is different from the conventional EM algorithm, and has some surprising advantages. 
Technically speaking, these formulas require a different iteration process (which is implemented here: https://github.com/ilirhin/python_artm), but the formulas look similar enough to a custom $\\Phi$ regularizer (if one sets `num_document_passes=1` in BigARTM)\n", + "\n", + "This notebook serves as a proof-of-concept for this idea.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0, '../..')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import requests\n", + "import scipy.sparse\n", + "\n", + "from numba import jit\n", + "from numpy import random\n", + "from matplotlib import pyplot as plt\n", + "\n", + "import artm\n", + "\n", + "from topicnet.cooking_machine.dataset import Dataset\n", + "from topicnet.cooking_machine.models import (\n", + " BaseScore as BaseTopicNetScore,\n", + " TopicModel\n", + ")\n", + "from topicnet.cooking_machine.models.base_regularizer import BaseRegularizer\n", + "from topicnet.cooking_machine.models.topic_prior_regularizer import TopicPriorRegularizer\n", + "from topicnet.cooking_machine.models.thetaless_regularizer import (\n", + " ThetalessRegularizer,\n", + " dataset2sparse_matrix,\n", + ")\n", + "\n", + "from topicnet.viewers.top_tokens_viewer import TopTokensViewer\n", + "from topicnet.viewers.top_documents_viewer import TopDocumentsViewer" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'@footer', '@header', '@word'}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = Dataset(\n", + " 
\"/data/datasets/20_News_dataset/20NG_BOW.csv\",\n", + " internals_folder_path=\"./20NG_internals\",\n", + ")\n", + "\n", + "dataset.get_possible_modalities()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "MAIN_MODALITY = \"@word\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preparing the coherence score" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Top-token coherence (co-occurrences are calculated using the same corpus)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def calc_doc_occurrences(dataset, modality):\n", + " \"\"\"\n", + " :param n_dw_matrix: sparse document-word matrix, shape is D x W\n", + " :return: sparse matrix of co-occurrences\n", + "\n", + " doc_occurrences[w1, w2] = the number of the documents\n", + " where there are w1 and w2\n", + " \"\"\"\n", + " n_dw_matrix = dataset2sparse_matrix(dataset, modality, modalities_to_use=[modality])\n", + " matrix = (scipy.sparse.csc_matrix(n_dw_matrix) > 0).astype(int)\n", + " co_occurrences = matrix.T * matrix\n", + "\n", + " return co_occurrences.diagonal(), co_occurrences\n", + "\n", + "\n", + "def create_pmi_top_function(\n", + " doc_occurrences, doc_co_occurrences,\n", + " documents_number, top_sizes,\n", + " co_occurrences_smooth=1.\n", + "):\n", + " \"\"\"\n", + " :param doc_occurrences: array of doc occurrences of words\n", + " :param doc_co_occurrences: sparse matrix of doc co-occurrences of words\n", + " :param documents_number: number of the documents\n", + " :param top_sizes: list of top values to calculate top-pmi for\n", + " :param co_occurrences_smooth: constant to smooth co-occurrences in log\n", + " :return: function which takes phi and theta and returns\n", + " pair of two arrays: pmi-s of the tops and ppmi-s of the tops\n", + "\n", + " pmi[i] - pmi(top of size top_sizes[i])\n", + " ppmi[i] - 
ppmi(top of size top_sizes[i])\n", + "\n", + " pmi(words) = sum_{u in words, v in words, u != v}\n", + " log(\n", + " (doc_co_occurrences[u, v] * documents_number + co_occurrences_smooth)\n", + " / doc_occurrences[u] / doc_occurrences[v]\n", + " )\n", + "\n", + " ppmi(words) = sum_{u in words, v in words, u != v}\n", + " max(log(\n", + " (doc_co_occurrences[u, v] * documents_number + co_occurrences_smooth)\n", + " / doc_occurrences[u] / doc_occurrences[v]\n", + " ), 0)\n", + "\n", + " \"\"\"\n", + " def func(phi):\n", + " T, W = phi.shape\n", + " max_top_size = max(top_sizes)\n", + " pmi, ppmi = np.zeros(max_top_size), np.zeros(max_top_size)\n", + " tops = np.argpartition(phi, -max_top_size, axis=1)[:, -max_top_size:]\n", + " \n", + " for t in range(T):\n", + " top = sorted(tops[t], key=lambda w: - phi[t, w])\n", + " co_occurrences = doc_co_occurrences[top, :][:, top].todense()\n", + " occurrences = doc_occurrences[top]\n", + " values = np.log(\n", + " (co_occurrences * documents_number + co_occurrences_smooth)\n", + " / (occurrences[:, np.newaxis] * occurrences[np.newaxis, :] + co_occurrences_smooth)\n", + " )\n", + " diag = np.diag_indices(len(values))\n", + " values.cumsum(axis=0).cumsum(axis=1)[diag] - values[diag].cumsum()\n", + " pmi += np.array(\n", + " values.cumsum(axis=0).cumsum(axis=1)[diag] - values[diag].cumsum()\n", + " ).ravel()\n", + "\n", + " values[values < 0.] 
= 0.\n", + " ppmi += np.array(\n", + " values.cumsum(axis=0).cumsum(axis=1)[diag] - values[diag].cumsum()\n", + " ).ravel()\n", + " \n", + " sizes = np.arange(2, max_top_size + 1)\n", + " pmi[1:] /= (T * sizes * (sizes - 1))\n", + " ppmi[1:] /= (T * sizes * (sizes - 1))\n", + " indices = np.array(top_sizes) - 1\n", + " \n", + " return pmi[indices], ppmi[indices]\n", + "\n", + " return func" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 9.41 s, sys: 1.27 s, total: 10.7 s\n", + "Wall time: 10.6 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "occurences, co_occurences = calc_doc_occurrences(dataset, MAIN_MODALITY)\n", + "\n", + "calc_pmi = create_pmi_top_function(\n", + " occurences, co_occurences,\n", + " dataset.get_dataset().shape[0], [20],\n", + " co_occurrences_smooth=1e-2,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "class TopTokenCoherence(BaseTopicNetScore):\n", + " def __init__(self, func):\n", + " super().__init__()\n", + "\n", + " self.calc_pmi = func\n", + "\n", + " def call(self, model: TopicModel):\n", + " values = self.calc_pmi(model.get_phi_dense()[0].T)\n", + "\n", + " return values[1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model training" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "thetaless_reg = ThetalessRegularizer(\n", + " name='thetaless', \n", + " tau=1,\n", + " dataset=dataset, \n", + " modality=MAIN_MODALITY,\n", + ")\n", + "\n", + "sparse_reg = artm.regularizers.SmoothSparsePhiRegularizer(\n", + " name='sparse_reg', \n", + " tau=-0.5,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def create_model(dataset, modality: str):\n", 
+ " model = artm.ARTM(num_topics=30, num_document_passes=1, class_ids={modality: 1.0})\n", + " model.initialize(dataset.get_dictionary())\n", + " \n", + " model.scores.add(\n", + " artm.scores.PerplexityScore(name='perplexity', class_ids=[modality])\n", + " )\n", + " model.scores.add(\n", + " artm.scores.SparsityPhiScore(name='sparsity', class_id=modality)\n", + " )\n", + " model.scores.add(\n", + " artm.scores.TopicKernelScore(name='kernel', class_id=modality)\n", + " )\n", + " \n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "NUM_FIT_ITERATIONS = 50" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 4min 7s, sys: 9.59 s, total: 4min 16s\n", + "Wall time: 1min 30s\n" + ] + } + ], + "source": [ + "plsa_model = TopicModel(\n", + " artm_model=create_model(dataset, MAIN_MODALITY), \n", + " model_id='plsa_model',\n", + " custom_scores={\n", + " \"coherence_ppmi\": TopTokenCoherence(calc_pmi)\n", + " }\n", + ")\n", + "\n", + "%time plsa_model._fit(dataset.get_batch_vectorizer(), NUM_FIT_ITERATIONS)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 6min 4s, sys: 37.7 s, total: 6min 42s\n", + "Wall time: 3min 18s\n" + ] + } + ], + "source": [ + "thetaless_model = TopicModel(\n", + " artm_model=create_model(dataset, MAIN_MODALITY), \n", + " model_id='thetaless_model',\n", + " custom_regularizers={\n", + " thetaless_reg.name: thetaless_reg\n", + " },\n", + " custom_scores={\n", + " \"coherence_ppmi\": TopTokenCoherence(calc_pmi)\n", + " }\n", + ")\n", + "\n", + "%time thetaless_model._fit(dataset.get_batch_vectorizer(), NUM_FIT_ITERATIONS)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 4min 21s, sys: 27.5 s, total: 4min 48s\n", + "Wall time: 1min 45s\n" + ] + } + ], + "source": [ + "artm_model = create_model(dataset, MAIN_MODALITY)\n", + "artm_model.regularizers.add(sparse_reg)\n", + "\n", + "sparse_model = TopicModel(\n", + " artm_model=artm_model, \n", + " model_id='sparse_model',\n", + " custom_scores={\"coherence_ppmi\": TopTokenCoherence(calc_pmi)}\n", + ")\n", + "\n", + "%time sparse_model._fit(dataset.get_batch_vectorizer(), NUM_FIT_ITERATIONS)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 7min 1s, sys: 1min 20s, total: 8min 21s\n", + "Wall time: 3min 49s\n" + ] + } + ], + "source": [ + "artm_model = create_model(dataset, MAIN_MODALITY)\n", + "artm_model.regularizers.add(sparse_reg)\n", + "\n", + "sparse_thetaless_model = TopicModel(\n", + " artm_model=artm_model, \n", + " model_id='sparse_thetaless_model',\n", + " custom_regularizers={\n", + " thetaless_reg.name: thetaless_reg\n", + " },\n", + " custom_scores={\n", + " \"coherence_ppmi\": TopTokenCoherence(calc_pmi)\n", + " }\n", + ")\n", + "\n", + "%time sparse_thetaless_model._fit(dataset.get_batch_vectorizer(), NUM_FIT_ITERATIONS)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Quality Visualizing" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_metric(view_name, value_name, title_name):\n", + " plt.figure(figsize=(14,8))\n", + "\n", + " plt.subplot(2,2,1)\n", + " values = np.array(getattr(thetaless_model.score_tracker[view_name], value_name))\n", + " med = np.median(values)\n", + " values[values > 5 * med] = np.nan\n", + " last_thetaless = values[-1]\n", + "\n", + " plt.title(f'{title_name} with Thetaless regularizer')\n", + " plt.plot(values[1:])\n", + "\n", 
+ " plt.subplot(2,2,2)\n", + " values = np.array(getattr(plsa_model.score_tracker[view_name], value_name))\n", + " med = np.median(values)\n", + " values[values > 5 * med] = np.nan\n", + " last_plsa = values[-1]\n", + " \n", + " plt.title(f'{title_name} without any regularization')\n", + " plt.plot(values[1:])\n", + " \n", + " plt.subplot(2,2,3)\n", + " values = np.array(getattr(sparse_model.score_tracker[view_name], value_name))\n", + " med = np.median(values)\n", + " values[values > 5 * med] = np.nan\n", + " last_sparse = values[-1]\n", + "\n", + " plt.title(f'{title_name} with sparsity regularizer')\n", + " plt.plot(values[1:])\n", + " \n", + " plt.subplot(2,2,4)\n", + " values = np.array(getattr(sparse_thetaless_model.score_tracker[view_name], value_name))\n", + " med = np.median(values)\n", + " values[values > 5 * med] = np.nan\n", + " last_sparse_thetaless = values[-1]\n", + "\n", + " plt.title(f'{title_name} with both regularizers')\n", + " plt.plot(values[1:])\n", + " \n", + " print(\n", + " f'Last values:\\n\\t'\n", + " f'thetaless – {last_thetaless}\\n\\t'\n", + " f'plsa – {last_plsa}\\n\\t'\n", + " f'sparse – {last_sparse}\\n\\t'\n", + " f'sparse_thetaless – {last_sparse_thetaless}\\n\\t' \n", + " )\n", + "\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Last values:\n", + "\tthetaless – 1787.9063720703125\n", + "\tplsa – 1876.0892333984375\n", + "\tsparse – 1317.7843017578125\n", + "\tsparse_thetaless – 1274.2459716796875\n", + "\t\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
      " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plot_metric(view_name='perplexity', value_name='value', title_name='Perplexity')" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Last values:\n", + "\tthetaless – 0.93656986951828\n", + "\tplsa – 0.8662593960762024\n", + "\tsparse – 0.9919413924217224\n", + "\tsparse_thetaless – 0.9935382604598999\n", + "\t\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
      " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plot_metric(view_name='sparsity', value_name='value', title_name='Sparsity')" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Last values:\n", + "\tthetaless – 0.7902413010597229\n", + "\tplsa – 0.7366632223129272\n", + "\tsparse – 0.8241350650787354\n", + "\tsparse_thetaless – 0.8964435458183289\n", + "\t\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
      " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plot_metric(view_name='kernel', value_name='average_contrast', title_name='Kernel Contrast')" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Last values:\n", + "\tthetaless – 0.9562450051307678\n", + "\tplsa – 0.8865759372711182\n", + "\tsparse – 0.9308170080184937\n", + "\tsparse_thetaless – 0.9824467897415161\n", + "\t\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
      " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plot_metric(view_name='kernel', value_name='average_purity', title_name='Kernel purity')" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Last values:\n", + "\tthetaless – 4043.699951171875\n", + "\tplsa – 4254.63330078125\n", + "\tsparse – 663.4000244140625\n", + "\tsparse_thetaless – 613.4666748046875\n", + "\t\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
      " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plot_metric(view_name='kernel', value_name='average_size', title_name='Kernel size')" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Last values:\n", + "\tthetaless – 1.6930015502023317\n", + "\tplsa – 1.5415081854659372\n", + "\tsparse – 1.4994938821199244\n", + "\tsparse_thetaless – 1.6034352180930274\n", + "\t\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
      " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plot_metric(view_name='coherence_ppmi', value_name='value', title_name='Top-20 Coherence')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Coherence evaluation on an external corpus (with Palmetto)\n", + "\n", + "Unfortunately, the service isn't available right now" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def eval_top_words(name, model, num_top_tokens=10):\n", + " ttv = TopTokensViewer(model, method='blei', num_top_tokens=num_top_tokens)\n", + " output = ttv.view()\n", + " S = \"http://palmetto.aksw.org/palmetto-webapp/service/{}?words=\".format(\"umass\")\n", + " topic_score = []\n", + "\n", + " for key, modalities in output.items():\n", + " if 'topic' in key:\n", + " words = list(output[key]['@default_class'].keys())\n", + "\n", + " print(S + \"%20\".join(words))\n", + "\n", + " result = requests.get(S + \"%20\".join(words)).text\n", + " topic_score += [float(result)]\n", + "\n", + " print(round(np.mean(topic_score), 2), round(np.median(topic_score), 2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# eval_top_words('thetaless', thetaless_model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# eval_top_words('plsa', plsa_model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# eval_top_words('sparse', sparse_model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# eval_top_words('sparse_thetaless', sparse_thetaless_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Let's look at topics" + ] + }, + { + 
"cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "def print_top_words(name, model, num_top_tokens=10):\n", + " ttv = TopTokensViewer(model, method='blei', num_top_tokens=num_top_tokens)\n", + " output = ttv.view()\n", + "\n", + " for key, modalities in output.items():\n", + " if 'topic' in key:\n", + " print(', '.join(output[key][MAIN_MODALITY].keys()))\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "space, earth, launch, project, mission, nasa, satellite, station, orbit, degree\n", + "\n", + "car, light, bike, ground, engine, front, ride, mile, road, wire\n", + "\n", + "game, play, team, win, player, hit, season, lose, fan, league\n", + "\n", + "system, mail, send, list, software, computer, user, address, email, package\n", + "\n", + "find, question, many, post, read, write, name, follow, book, must\n", + "\n", + "also, high, power, large, small, low, offer, control, full, add\n", + "\n", + "key, information, chip, public, message, encryption, technology, security, clipper, algorithm\n", + "\n", + "run, work, window, thank, problem, please, help, set, appreciate, driver\n", + "\n", + "use, need, number, line, order, buy, price, sell, require, check\n", + "\n", + "drive, card, disk, mac, board, hard, mb, monitor, ms, memory\n", + "\n", + "study, drug, effect, medical, disease, patient, doctor, food, ed, health\n", + "\n", + "one, get, think, good, seem, really, put, hear, probably, much\n", + "\n", + "x, output, function, entry, widget, sun, motif, define, return, include\n", + "\n", + "believe, claim, fact, true, exist, evidence, agree, matter, argument, religion\n", + "\n", + "kill, armenian, war, armenians, turkish, attack, history, dead, jews, russian\n", + "\n", + "first, two, second, end, three, lead, hold, save, close, head\n", + "\n", + "time, new, year, come, 
day, call, last, old, great, still\n", + "\n", + "report, university, 1993, research, national, news, april, center, page, school\n", + "\n", + "state, government, law, gun, mr, president, country, us, house, member\n", + "\n", + "make, point, even, case, mean, keep, long, actually, consider, every\n", + "\n", + "1, 2, 0, 3, 4, 5, 6, 25, 10, 7\n", + "\n", + "say, people, right, never, life, child, man, person, live, feel\n", + "\n", + "would, know, like, could, tell, give, something, anyone, someone, anything\n", + "\n", + "8, 5, e, 7, 6, b, 9, c, 4, h\n", + "\n", + "g, w, r, q, p, v, k, u, z, n\n", + "\n", + "go, see, well, take, want, look, thing, way, try, back\n", + "\n", + "file, program, image, version, color, display, format, code, available, server\n", + "\n", + "god, jesus, church, christian, israel, bible, love, christ, sin, word\n", + "\n", + "ax, max, g9v, b8f, q, a86, 7, 145, 1d9, n\n", + "\n", + "may, also, since, part, group, change, however, problem, etc, cause\n", + "\n" + ] + } + ], + "source": [ + "print_top_words('thetaless', thetaless_model)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "space, launch, earth, project, mission, nasa, satellite, orbit, system, station\n", + "\n", + "car, bike, turn, power, light, ground, engine, back, front, ride\n", + "\n", + "year, game, last, win, play, lose, team, hit, player, guy\n", + "\n", + "system, mail, software, computer, list, user, pc, support, mac, package\n", + "\n", + "question, post, read, find, many, book, write, different, answer, article\n", + "\n", + "also, high, work, small, offer, large, time, several, low, interested\n", + "\n", + "key, public, information, message, chip, encryption, system, security, clipper, private\n", + "\n", + "window, run, thank, anyone, work, problem, help, please, use, get\n", + "\n", + "use, number, call, line, need, sell, price, 
may, order, phone\n", + "\n", + "drive, card, disk, hard, board, system, mb, speed, fast, scsi\n", + "\n", + "effect, cause, drug, study, medical, disease, patient, increase, doctor, food\n", + "\n", + "one, think, get, seem, good, really, hear, come, though, talk\n", + "\n", + "x, server, output, include, file, application, entry, widget, sun, program\n", + "\n", + "believe, people, true, exist, argument, religion, fact, matter, agree, belief\n", + "\n", + "mr, kill, people, armenian, q, armenians, president, turkish, say, attack\n", + "\n", + "first, right, two, point, second, give, keep, yes, end, pass\n", + "\n", + "go, could, new, time, get, ask, great, come, old, call\n", + "\n", + "report, university, 1993, april, national, research, page, include, school, center\n", + "\n", + "state, government, gun, law, country, control, us, member, crime, weapon\n", + "\n", + "make, even, may, case, consider, fire, less, mean, long, force\n", + "\n", + "25, db, team, play, period, hockey, goal, la, 10, 1st\n", + "\n", + "say, reason, claim, life, even, never, child, person, man, people\n", + "\n", + "would, know, like, something, tell, someone, good, bad, remember, enough\n", + "\n", + "1, 2, 0, 3, 4, 5, 6, 7, 8, 9\n", + "\n", + "w, g, r, b, c, k, v, p, u, e\n", + "\n", + "well, see, want, thing, look, try, go, make, take, let\n", + "\n", + "file, image, program, version, available, color, format, display, ftp, copy\n", + "\n", + "god, jesus, israel, word, jews, christ, bible, man, love, sin\n", + "\n", + "ax, max, q, g, r, b8f, g9v, 7, p, a86\n", + "\n", + "since, group, problem, etc, may, find, also, result, use, form\n", + "\n" + ] + } + ], + "source": [ + "print_top_words('plsa', plsa_model)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "space, center, science, research, launch, design, project, cost, material, mission\n", + "\n", + 
"get, good, back, turn, buy, need, stuff, little, bike, anybody\n", + "\n", + "year, game, last, team, play, win, player, lose, guy, next\n", + "\n", + "system, computer, software, mail, machine, user, mac, base, memory, monitor\n", + "\n", + "question, post, read, write, answer, must, article, different, message, bit\n", + "\n", + "number, also, high, small, large, several, open, case, use, type\n", + "\n", + "key, information, chip, use, group, provide, may, encryption, security, clipper\n", + "\n", + "use, work, problem, run, please, anyone, help, thank, need, set\n", + "\n", + "use, new, call, price, sell, offer, check, include, interested, box\n", + "\n", + "drive, card, hard, disk, power, speed, board, sound, driver, mb\n", + "\n", + "effect, condition, hit, soon, disease, patient, compare, mile, low, year\n", + "\n", + "think, one, seem, much, hear, though, idea, see, really, put\n", + "\n", + "x, list, send, line, copy, include, name, source, output, code\n", + "\n", + "believe, people, true, church, christian, argument, fact, israel, jews, agree\n", + "\n", + "kill, people, mr, president, armenian, child, armenians, q, attack, house\n", + "\n", + "first, second, two, name, body, save, three, third, face, mark\n", + "\n", + "go, take, come, could, time, start, leave, week, around, ago\n", + "\n", + "book, report, university, 1993, national, news, april, drug, page, turkish\n", + "\n", + "state, right, law, gun, government, people, issue, control, country, support\n", + "\n", + "point, long, yes, feel, make, less, may, never, mean, time\n", + "\n", + "10, 15, 12, 20, db, 11, 13, period, series, 30\n", + "\n", + "say, even, one, reason, right, claim, make, evidence, mind, perhaps\n", + "\n", + "would, know, like, tell, something, someone, car, really, wrong, remember\n", + "\n", + "1, 2, 0, 3, 4, 5, 6, 7, 8, 25\n", + "\n", + "g, w, r, b, p, c, q, n, v, e\n", + "\n", + "want, look, well, make, see, thing, try, way, let, like\n", + "\n", + "file, window, image, 
program, version, color, application, available, display, format\n", + "\n", + "god, man, word, jesus, life, world, christ, death, war, love\n", + "\n", + "ax, max, q, b8f, g9v, a86, 7, 145, 1d9, r\n", + "\n", + "find, since, part, many, cause, result, also, group, people, make\n", + "\n" + ] + } + ], + "source": [ + "print_top_words('sparse', sparse_model)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "space, earth, science, design, center, launch, field, project, material, mission\n", + "\n", + "get, good, back, power, buy, around, big, turn, stuff, light\n", + "\n", + "year, game, last, play, team, win, player, lose, guy, season\n", + "\n", + "system, mail, software, support, computer, machine, user, pc, standard, contact\n", + "\n", + "question, post, read, write, follow, must, ask, different, bit, note\n", + "\n", + "also, number, high, source, large, several, small, add, build, require\n", + "\n", + "key, information, chip, provide, phone, use, service, order, encryption, technology\n", + "\n", + "use, work, problem, need, run, please, help, thank, anyone, window\n", + "\n", + "new, call, include, price, etc, sell, offer, check, pay, cost\n", + "\n", + "drive, card, disk, driver, hard, speed, board, mb, fast, ms\n", + "\n", + "effect, condition, medical, soon, disease, compare, patient, mile, average, ed\n", + "\n", + "one, think, much, seem, really, still, put, hear, sure, probably\n", + "\n", + "x, send, list, line, copy, 25, output, return, section, request\n", + "\n", + "believe, people, true, fact, church, agree, argument, matter, israel, jews\n", + "\n", + "kill, mr, president, armenian, fire, child, armenians, attack, house, press\n", + "\n", + "first, two, second, lead, three, great, body, save, hit, john\n", + "\n", + "go, time, take, come, day, start, old, leave, next, week\n", + "\n", + "book, report, university, 
1993, study, news, national, april, drug, page\n", + "\n", + "state, government, law, issue, gun, control, public, country, war, people\n", + "\n", + "may, point, case, mean, long, show, keep, course, actually, enough\n", + "\n", + "0, 10, 20, 15, 12, 14, 11, 16, 13, db\n", + "\n", + "say, even, right, people, never, reason, every, claim, person, live\n", + "\n", + "would, know, like, could, tell, give, something, someone, car, anything\n", + "\n", + "1, 2, 3, 4, 5, 6, 8, 7, 9, 27\n", + "\n", + "g, w, r, q, p, c, e, b, n, v\n", + "\n", + "make, see, well, want, look, thing, way, try, ca, let\n", + "\n", + "file, program, image, version, color, application, display, available, code, format\n", + "\n", + "god, man, word, life, world, jesus, christian, bible, love, christ\n", + "\n", + "ax, max, b8f, g9v, a86, q, 145, 1d9, 7, bhj\n", + "\n", + "find, many, since, part, group, however, cause, result, allow, free\n", + "\n" + ] + } + ], + "source": [ + "print_top_words('sparse_thetaless', sparse_thetaless_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Last coherence values (using different number of top tokens)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The plots above show how the coherence (PPMI) of top 20 tokens evolves. 
In addition, we can look at the last PMI and PPMI values using different sizes of top-token list (5, 10, 20, 50, 100 words)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "occurences, co_occurences = calc_doc_occurrences(dataset, MAIN_MODALITY)\n", + "\n", + "calc_pmi = create_pmi_top_function(\n", + " occurences, co_occurences,\n", + " dataset.get_dataset().shape[0], [5, 10, 20, 50, 100],\n", + " co_occurrences_smooth=1e-2\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([1.21499297, 1.41841121, 1.4773908 , 1.64972323, 1.70756978]),\n", + " array([1.31236076, 1.47163629, 1.54150819, 1.69634847, 1.80704321]))" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "calc_pmi(plsa_model.get_phi_dense()[0].T)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([1.57331496, 1.62475971, 1.65381981, 1.86711067, 1.87356811]),\n", + " array([1.57331496, 1.62499837, 1.69300155, 1.94859555, 2.09445053]))" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "calc_pmi(thetaless_model.get_phi_dense()[0].T)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([1.35890928, 1.35700836, 1.48383579, 1.59996205, 1.60108423]),\n", + " array([1.35890928, 1.37848896, 1.49985511, 1.65918859, 1.78920465]))" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "calc_pmi(sparse_model.get_phi_dense()[0].T)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([1.42275237, 1.48636823, 1.59154109, 1.75142554, 
1.55162446]),\n", + " array([1.42275237, 1.48636823, 1.60343522, 1.83606053, 1.9786894 ]))" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "calc_pmi(sparse_thetaless_model.get_phi_dense()[0].T)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Also, let's take a look at train + test together, instead of relying on train only:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We observed several interesting things.\n", + "\n", + "1) The regularizer improves coherence and sparsity without explicitly optimizing for it.\n", + "\n", + "2) The regularizer moves common but uninformative words away from \"informative\" topics. Compare the 3rd topic of PLSA: \n", + "\n", + "`year, game, last, win, play, lose, team, hit, player, guy`\n", + "\n", + "with the corresponding topic of Thetaless:\n", + "\n", + "`game, play, team, win, player, hit, season, lose, fan, league`\n", + "\n", + "If we look at the 20th topic, we will see the reverse trend.\n", + "\n", + "The topic of PLSA is uninformative with a few meaningful words thrown in:\n", + "\n", + "`make, even, may, case, consider, fire, less, mean, long, force`\n", + "\n", + "The corresponding topic of Thetaless consists of common parlance almost entirely, without any signal lost in noise.\n", + "\n", + "`make, point, even, case, mean, keep, long, actually, consider, every`\n", + "\n", + "It should be noted that we had not specified the usual distinction between specific and background topics; the separation here is purely emergent.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "topicnet", + "language": "python", + "name": "topicnet" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": 
"text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/topicnet/demos/Visualizing-Your-Model-Documents.ipynb b/topicnet/demos/Visualizing-Your-Model-Documents.ipynb index e091dd2..1a60d57 100644 --- a/topicnet/demos/Visualizing-Your-Model-Documents.ipynb +++ b/topicnet/demos/Visualizing-Your-Model-Documents.ipynb @@ -26,10 +26,12 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ + "import os\n", + "\n", "import colorlover as cl\n", "import plotly.graph_objs as go\n", "\n", @@ -4226,14 +4228,14 @@ " \n", " " ], "text/plain": [ - "" + "" ] }, "execution_count": 4, @@ -4242,7 +4244,7 @@ } ], "source": [ - "IFrame(src='topic_clusters.html', width=900, height=700)" + "IFrame(src=os.path.join('images', 'topic_clusters.html'), width=900, height=700)" ] }, { @@ -36598,9 +36600,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Environment (conda_artm10)", + "display_name": "topicnet", "language": "python", - "name": "conda_artm10" + "name": "topicnet" }, "language_info": { "codemirror_mode": { @@ -36612,7 +36614,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.6.10" } }, "nbformat": 4, diff --git a/topicnet/demos/topic_clusters.html b/topicnet/demos/images/topic_clusters.html similarity index 100% rename from topicnet/demos/topic_clusters.html rename to topicnet/demos/images/topic_clusters.html diff --git a/topicnet/demos/topic_thetaless_regularizer.ipynb b/topicnet/demos/topic_thetaless_regularizer.ipynb deleted file mode 100644 index 11438f9..0000000 --- a/topicnet/demos/topic_thetaless_regularizer.ipynb +++ /dev/null @@ -1,1512 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Topic modeling without Theta distribution\n", - "\n", - "Based on 
unpublished (yet) research by Ilya Irhin, Victor Bulatov, Konstantin Vorontsov\n", - "\n", - "One of the interpretations of topic modeling is this:\n", - "\n", - "\\begin{equation}\n", - "L(\\Phi, \\Theta) + R(\\Phi, \\Theta) \\to \\max_{\\Phi, \\Theta},\n", - "\\end{equation}\n", - "\n", - "Once the inference is directly formulated as an optimization problem, one can ask: is it possible to reduce the number f inferred parameters? \n", - "\n", - "In the practice, word-topic matrix $(\\Phi)$ has a larger importance than document-topic matrix $(\\Theta)$; so we can replace the original optimization problem with a different one:\n", - "\n", - "\\begin{equation}\n", - "L(\\Phi, f(\\Phi) ) + R(\\Phi, f(\\Phi) ) \\to \\max_{\\Phi},\n", - "\\end{equation}\n", - "\n", - "Some math later, it turns out that the \"formally correct\" optimization algorithm for this problem is different from conventional EM algorithm, and has some surprising advantages. Technically speaking, these formulas require a different iteration process (which is implemented here: https://github.com/ilirhin/python_artm), but the formulas look similar enough to a custom $\\Phi$ regularizer (if one sets `num_document_passes=1` in BigARTM)\n", - "\n", - "This notebook serves as a proof-of-concept for this idea.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from collections import Counter\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "from numpy import random\n", - "from matplotlib import pyplot as plt\n", - "from numba import jit\n", - "import scipy.sparse\n", - "\n", - "import artm\n", - "\n", - "from topicnet.cooking_machine.dataset import Dataset\n", - "\n", - "from topicnet.cooking_machine.models import TopicModel\n", - "from topicnet.cooking_machine.models.base_regularizer import BaseRegularizer\n", - "from topicnet.cooking_machine.models.topic_prior_regularizer import TopicPriorRegularizer\n", - "\n", - "from 
topicnet.viewers.top_tokens_viewer import TopTokensViewer\n", - "from topicnet.viewers.top_documents_viewer import TopDocumentsViewer" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from topicnet.cooking_machine.models.thetaless_regularizer import ThetalessRegularizer\n", - "\n", - "# TopicPriorRegularizer are used here to make Phi sparse\n", - "# usually, we use an artm.SmoothSparsePhiRegularizer for this, but then it makes the performance comparision \"unfair\"\n", - "\n", - "from topicnet.cooking_machine.models.topic_prior_regularizer import TopicPriorRegularizer\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# the definition of Thetaless regularizer\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Preparing the dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import ast\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def csv2sparse_matrix(train_df, token_to_id=None, ignore_unknown_tokens=False):\n", - " if token_to_id is None:\n", - " token_to_id = dict()\n", - " train_row, train_col, train_data = [], [], []\n", - " for row_num, r in enumerate(train_df.iterrows()):\n", - " bag_of_words = Counter()\n", - " for token, _ in ast.literal_eval(r[1].tokenized):\n", - " token_id = token_to_id.get(token)\n", - " if token_id is None:\n", - " if ignore_unknown_tokens:\n", - " continue\n", - " token_id = len(token_to_id)\n", - " token_to_id[token] = token_id\n", - " bag_of_words[token_id] += 1\n", - "\n", - " for token_id, count in bag_of_words.items():\n", - " 
train_row.append(row_num)\n", - " train_col.append(token_id)\n", - " train_data.append(count)\n", - "\n", - " train_n_dw_matrix = scipy.sparse.csr_matrix(\n", - " (train_data, (train_row, train_col)), \n", - " shape=(len(train_df), len(token_to_id))\n", - " )\n", - " return train_n_dw_matrix, token_to_id\n", - "\n", - "\n", - "train_df = pd.read_csv('/data_mil/datasets/20_News_dataset/train_preprocessed.csv')\n", - "train_n_dw_matrix, token_to_id = csv2sparse_matrix(train_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "test_df = pd.read_csv('/data_mil/datasets/20_News_dataset/train_preprocessed.csv')\n", - "test_n_dw_matrix, _ = csv2sparse_matrix(test_df, dict(token_to_id), True)\n", - "\n", - "vocabulary = {\n", - " token_id: token\n", - " for token, token_id in token_to_id.items()\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "train_batch_vectorizer = artm.BatchVectorizer(\n", - " data_format='bow_n_wd', \n", - " n_wd=train_n_dw_matrix.T, \n", - " vocabulary=vocabulary, \n", - " batch_size=100000\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "test_batch_vectorizer = artm.BatchVectorizer(\n", - " data_format='bow_n_wd', \n", - " n_wd=test_n_dw_matrix.T, \n", - " vocabulary=vocabulary, \n", - " batch_size=100000\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Обучение модели" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "def create_model():\n", - " model = artm.ARTM(num_topics=30, num_document_passes=1)\n", - " model.scores.add(artm.scores.PerplexityScore(name='perplexity'))\n", - " 
model.scores.add(artm.scores.PerplexityScore(\n", - " name='test_perplexity', \n", - " dictionary=test_batch_vectorizer.dictionary\n", - " ))\n", - " model.scores.add(artm.scores.SparsityPhiScore(name='sparsity'))\n", - " model.scores.add(artm.scores.TopicKernelScore(name='kernel'))\n", - " model.initialize(train_batch_vectorizer.dictionary)\n", - " return model\n", - "\n", - "\n", - "def fit(model):\n", - " model._fit(\n", - " dataset_trainable=train_batch_vectorizer, \n", - " num_iterations=50\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "thetaless_reg = ThetalessRegularizer(\n", - " name='thetaless', \n", - " tau=1, \n", - " n_dw_matrix=train_n_dw_matrix\n", - ")\n", - "\n", - "sparse_reg = TopicPriorRegularizer(\n", - " name='sparse_reg', \n", - " tau=-0.5, \n", - " beta=1,\n", - " num_topics=30\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Top-token coherence (co-occurrences are calculated using the same corpus)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "\n", - "def calc_doc_occurrences(n_dw_matrix):\n", - " \"\"\"\n", - " :param n_dw_matrix: sparse document-word matrix, shape is D x W\n", - " :return: sparse matrix of co-occurrences\n", - "\n", - " doc_occurrences[w1, w2] = the number of the documents\n", - " where there are w1 and w2\n", - " \"\"\"\n", - " matrix = (scipy.sparse.csc_matrix(n_dw_matrix) > 0).astype(int)\n", - " co_occurrences = matrix.T * matrix\n", - " return co_occurrences.diagonal(), co_occurrences\n", - "\n", - "\n", - "def create_pmi_top_function(\n", - " doc_occurrences, doc_co_occurrences,\n", - " documents_number, top_sizes,\n", - " co_occurrences_smooth=1.\n", - "):\n", - " \"\"\"\n", - " :param doc_occurrences: array of doc occurrences of words\n", - " :param doc_co_occurrences: sparse matrix of doc 
co-occurrences of words\n", - " :param documents_number: number of the documents\n", - " :param top_sizes: list of top values to calculate top-pmi for\n", - " :param co_occurrences_smooth: constant to smooth co-occurrences in log\n", - " :return: function which takes phi and theta and returns\n", - " pair of two arrays: pmi-s of the tops and ppmi-s of the tops\n", - "\n", - " pmi[i] - pmi(top of size top_sizes[i])\n", - " ppmi[i] - ppmi(top of size top_sizes[i])\n", - "\n", - " pmi(words) = sum_{u in words, v in words, u != v}\n", - " log(\n", - " (doc_co_occurrences[u, v] * documents_number + co_occurrences_smooth)\n", - " / doc_occurrences[u] / doc_occurrences[v]\n", - " )\n", - "\n", - " ppmi(words) = sum_{u in words, v in words, u != v}\n", - " max(log(\n", - " (doc_co_occurrences[u, v] * documents_number + co_occurrences_smooth)\n", - " / doc_occurrences[u] / doc_occurrences[v]\n", - " ), 0)\n", - "\n", - " \"\"\"\n", - " def func(phi):\n", - " T, W = phi.shape\n", - " max_top_size = max(top_sizes)\n", - " pmi, ppmi = np.zeros(max_top_size), np.zeros(max_top_size)\n", - " tops = np.argpartition(phi, -max_top_size, axis=1)[:, -max_top_size:]\n", - " for t in range(T):\n", - " top = sorted(tops[t], key=lambda w: - phi[t, w])\n", - " co_occurrences = doc_co_occurrences[top, :][:, top].todense()\n", - " occurrences = doc_occurrences[top]\n", - " values = np.log(\n", - " (co_occurrences * documents_number + co_occurrences_smooth)\n", - " / occurrences[:, np.newaxis]\n", - " / occurrences[np.newaxis, :]\n", - " )\n", - " diag = np.diag_indices(len(values))\n", - " values.cumsum(axis=0).cumsum(axis=1)[diag] - values[diag].cumsum()\n", - " pmi += np.array(\n", - " values.cumsum(axis=0).cumsum(axis=1)[diag] - values[diag].cumsum()\n", - " ).ravel()\n", - " values[values < 0.] 
= 0.\n", - " ppmi += np.array(\n", - " values.cumsum(axis=0).cumsum(axis=1)[diag] - values[diag].cumsum()\n", - " ).ravel()\n", - " sizes = np.arange(2, max_top_size + 1)\n", - " pmi[1:] /= (T * sizes * (sizes - 1))\n", - " ppmi[1:] /= (T * sizes * (sizes - 1))\n", - " indices = np.array(top_sizes) - 1\n", - " return pmi[indices], ppmi[indices]\n", - "\n", - " return func\n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "occurences, co_occurences = calc_doc_occurrences(train_n_dw_matrix)\n", - "calc_pmi = create_pmi_top_function(\n", - " occurences, co_occurences,\n", - " test_n_dw_matrix.shape[0], [20],\n", - " co_occurrences_smooth=1.\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "from topicnet.cooking_machine.models import BaseScore as BaseTopicNetScore, TopicModel\n", - "\n", - "\n", - "class TopTokenCoherence(BaseTopicNetScore):\n", - " def __init__(self, func):\n", - " super().__init__()\n", - "\n", - " self.calc_pmi = func\n", - "\n", - " def call(self, model: TopicModel):\n", - " values = self.calc_pmi(model.get_phi_dense()[0].T)\n", - "\n", - " return values[1]\n" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 2min, sys: 17 s, total: 2min 17s\n", - "Wall time: 1min 42s\n" - ] - } - ], - "source": [ - "plsa_model = TopicModel(\n", - " artm_model=create_model(), \n", - " model_id='plsa_model',\n", - " custom_scores={\"coherence_ppmi\": TopTokenCoherence(calc_pmi)}\n", - ")\n", - "\n", - "%time fit(plsa_model)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - 
"metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 9min 50s, sys: 1min 1s, total: 10min 52s\n", - "Wall time: 2min 47s\n" - ] - } - ], - "source": [ - "thetaless_model = TopicModel(\n", - " artm_model=create_model(), \n", - " model_id='thetaless_model',\n", - " custom_regularizers={\n", - " thetaless_reg.name: thetaless_reg\n", - " },\n", - " custom_scores={\"coherence_ppmi\": TopTokenCoherence(calc_pmi)}\n", - ")\n", - "\n", - "%time fit(thetaless_model)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 3min 51s, sys: 1min 15s, total: 5min 6s\n", - "Wall time: 2min 33s\n" - ] - } - ], - "source": [ - "sparse_model = TopicModel(\n", - " artm_model=create_model(), \n", - " model_id='sparse_model',\n", - " custom_regularizers={\n", - " sparse_reg.name: sparse_reg\n", - " },\n", - " custom_scores={\"coherence_ppmi\": TopTokenCoherence(calc_pmi)}\n", - ")\n", - "\n", - "%time fit(sparse_model)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 10min 42s, sys: 1min 56s, total: 12min 38s\n", - "Wall time: 2min 47s\n" - ] - } - ], - "source": [ - "sparse_thetaless_model = TopicModel(\n", - " artm_model=create_model(), \n", - " model_id='sparse_thetaless_model',\n", - " custom_regularizers={\n", - " sparse_reg.name: sparse_reg,\n", - " thetaless_reg.name: thetaless_reg\n", - " },\n", - " custom_scores={\"coherence_ppmi\": TopTokenCoherence(calc_pmi)}\n", - ")\n", - "\n", - "%time fit(sparse_thetaless_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Quality Visualizing" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [], - "source": [ - "def plot_metric(view_name, 
value_name, title_name):\n", - " plt.figure(figsize=(14,8))\n", - "\n", - " plt.subplot(2,2,1)\n", - " values = np.array(getattr(thetaless_model.score_tracker[view_name], value_name))\n", - " med = np.median(values)\n", - " values[values > 5 * med] = np.nan\n", - " last_thetaless = values[-1]\n", - "\n", - " plt.title(f'{title_name} with Thetaless regularizer')\n", - " plt.plot(values[1:])\n", - "\n", - " plt.subplot(2,2,2)\n", - " values = np.array(getattr(plsa_model.score_tracker[view_name], value_name))\n", - " med = np.median(values)\n", - " values[values > 5 * med] = np.nan\n", - " last_plsa = values[-1]\n", - " \n", - " plt.title(f'{title_name} without any regularization')\n", - " plt.plot(values[1:])\n", - " \n", - " plt.subplot(2,2,3)\n", - " values = np.array(getattr(sparse_model.score_tracker[view_name], value_name))\n", - " med = np.median(values)\n", - " values[values > 5 * med] = np.nan\n", - " last_sparse = values[-1]\n", - "\n", - " plt.title(f'{title_name} with sparsity regularizer')\n", - " plt.plot(values[1:])\n", - " \n", - " plt.subplot(2,2,4)\n", - " values = np.array(getattr(sparse_thetaless_model.score_tracker[view_name], value_name))\n", - " med = np.median(values)\n", - " values[values > 5 * med] = np.nan\n", - " last_sparse_thetaless = values[-1]\n", - "\n", - " plt.title(f'{title_name} with both regularizers')\n", - " plt.plot(values[1:])\n", - " \n", - " \n", - " print(\n", - " f'Last values:\\n\\t'\n", - " f'thetaless – {last_thetaless}\\n\\t'\n", - " f'plsa – {last_plsa}\\n\\t'\n", - " f'sparse – {last_sparse}\\n\\t'\n", - " f'sparse_thetaless – {last_sparse_thetaless}\\n\\t' \n", - " )\n", - " plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Last values:\n", - "\tthetaless – 1227.15234375\n", - "\tplsa – 1314.651123046875\n", - "\tsparse – 957.7000732421875\n", - "\tsparse_thetaless – 895.8453979492188\n", - 
"\t\n" - ] - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAz8AAAHiCAYAAADGanKbAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nOzdd3xV9f3H8dcne5LNSoCwlSUjguBCrbNWtNWqdaDVWkeHVqv116HW2tqq3XXvUbe1WneraCkCsvcIO6wEQgIZZPH9/XEOeAkhJJBwktz38/E4j9x8z7nnfM65N+ebz/l+z/eYcw4REREREZGOLiLoAERERERERA4HJT8iIiIiIhIWlPyIiIiIiEhYUPIjIiIiIiJhQcmPiIiIiIiEBSU/IiIiIiISFpT8SIPMbJKZXd0C61loZuNbIKQW22ZL7Zu/rtVm9pWWWFcTtvW0mf3qcGzrcDvUfTOz98xsYkvGJCJtm+opaY5Dra/NrMzM+rRwTD399Ua25HqlcUp+2hH/D7fS/0PZbGZPmVlS0HE1xjk32Dk3CcDM7jSz59vLNs3sEv9Yl/nHfVfI72WHGufhTJw6Oufcmc65Z4KOQyTcqZ5qu9v0tzXezAoOx7Y6GudcknNu5aGso36975xb66+37tAjlKZS8tP+fM05lwSMBI4GftbcFZhZVItH1QE5517wT0pJwJnAht2/+2Vho61+Z8zT4uex1lqvSJhQPSXN1lY/87Yalxw8Ve7tlHNuPfAeMATAzFLM7Akz22hm683sV7ubUc3sCjP7n5n9wcyKgTtDyv5iZqVmtsTMTtnf9szs22a22My2mdkHZtbLLx9nZlvMrIf/+1FmVmJmR/i/rzazr5jZGcD/ARf6VwTnmtkFZjaz3nZuNrM3G9j+SWY2P+T3f5vZ9JDfJ5vZuQfaZsgqe/n7v8PMPjSzzOYc/3qGm9k8/zi+bGZxIXGdbWZz/GMyxcyG+eXPAT2Bt/3YbvXLXzWzTf66PjOzwfvb6P7W7c+7zf8e7DCzpbs/WzMbbWYzzGy7f1X29/tZ93gzK/DXswl4qgnbHGlms/1tvuofi1/5864ws8n1tuHMrF8D204zs3+ZWZH/ffuXmeWEzJ9kZveY2f+ACqCPhXQR8b9bZSGTM797iZkd48dd4i83vrH17u/Yi8iBqZ5qvXrKzM4xr+tciX/uOjJk3l7nVvO7FZtZov95dA85P3ZvYD++6p/Lt5vZOjO7M2Rerr/+iWa21j+uP/XndTWzCjPLCFl+lH8uj25gO3ea2Wtm9ryZbQeuMLMIM/uJma0ws61m9oqZpYe853IzW+PP+7mFtKRYve7T1kgrl3l14ef+8dtoZn81s5h6x/AGM1sOLA89rmYWevzK/H12/jJ9zexjP74tZvaCmaX68/ap90OOZ5S/THcze8vMis0s38y+U+94vWJmz/rfiYVmltfQ/skBOOc0tZMJWA18xX/dA1gI3O3//ibwCJAIdAamA9/1510B1ALfB6KA+JCym4Bo4EKgFEj33zMJuNp/fS6QDxzpv/9nwJSQuO4BPvbXOw/43n5ivhN4PmReLFAMHBlSNhv4RgP7HgdUApl+DJuADUCyv91KIONA2wzZtxXAAP+9k4B7D3DsxwMF+/lMpgPdgXRgMXCtP28kUAiMASKBif7ysfXjDFnft/19igX+CMwJmfc08KsDrRsYCKwDuvvL5gJ9/defA5f5r5OAYxrZ31rgt/464w+wzRhgDfBDvO/T14HqkHivACbX24YD+jWwbxnAN4AE/1i8CrxZ7/NbCwz2vwvRhHxf623jGmAJ0AnIBrYCZ+Fd+DnV/z1rf+sN+m9ek6b2NqF6qtXrKb+s3D+HRQO3+vse48/fc271f3+aL8+v42mgLqu37fHAUP88OQzYDJzrz8v11/+YH9dRQNXu4wO8C1wX
sq4/AH/Zz3buBGr8zy7CX9+NwFQgxz/2jwAv+ssPAsqA4/DqnPv993+l/n42tK/1jvko4Bj/c8rFq7tvDFnWAR/h1evxDR3XkGVfCImxn/+5xAJZwGfAHxuKod7xjPJ//xR4EO+7NBwoAk4JOV478eqwSOA3wNSg/+bb46SWn/bnTTMrASbj/ZH82sy64HXLutE5V+6cK8Q74VwU8r4Nzrm/OOdqnXOVflkh3h9ljXPuZWAp8NUGtvld4DfOucXOuVrg13itHb38+XcCKXgV2Qbgb03ZEedcFfAycCmAea0cucC/Glh2JzADOAHIw6u8JgPH4p3AljvntjZlu76nnHPL/GPxCt5J5mD92Tm3wTlXDLwdsq7vAI8456Y55+qcd09KlR9vg5xzTzrndvjH5k7gKDNLaWDRxtZdh3fiHWRm0c651c65Ff77aoB+ZpbpnCtzzk1tZL92AXc456r849TYNndXIn/2v09v4H0fms05t9U597pzrsI5twPvn5YT6y32tHNuof99rmloPWZ2HPAr4Bzn3Ha879m7zrl3nXO7nHMf4X2nzmrOekXkgFRPtW49dSHwjnPuI/88dT9e4jCuGeveL+fcJOfcfP88OQ94kX3PwXc55yqdc3OBuXhJEMAzfHmsIoGLgeca2dznzrk3/W1V4n2OP3XOFYTUg+f7LSPnA2875yY756qBX+AlDgezjzOdc1P979pqvCSr/j7+xjlXHPJd3IeZ3QYcgXfhEudcvv+5VDnnioDfN7De/a2rB15id5tzbqdzbg7wOHBZyGKT/TqsDu+4HtXAquQAlPy0P+c651Kdc72cc9f7f5S98K7+bPSbcEvw/pA7h7xvXQPrWu+cCz1xrMFrwaivF/CnkHUXA4Z3JR3/5Ps0XteGB+qt80CeAb5lZob3B/6Kf8JryKd4V3JO8F9PwjupnOj/3hybQl5X4LWCHKz9rasXcPPu4+Yfux40fIwxs0gzu9dv7t+Od4UIvKuI9e133c65fLyrZ3cChWb2UkjXhqvwrhouMbMvzOzsRvaryK/MD7hNf6r/fWroO3dAZpZgZo/4XRu24105S7W9R8NpdN1+JfIKMNE5tywk/gvqxX8c0O1QYxaRvaieat16qjvecQDAObcL79hlN3P9DTKzMWb2id9drRS4ln3rof3F9k+8C2998FpASp1zjV0Iq/+Z9wL+EfI5Lsa7oNcFb7/3LO+cq8BrvW82MxtgXpfqTX4982v23ccD1TNn4vV2OHd3gmRmnf06d72/3ucbWO/+dAeK/Yt+u61h78+1/nGPM92T1GxKfjqGdXhX4DP9CifVOdfJORd6v0hDJ/ps/2S+W0+8K2INrf+7IetOdc7FO+emAJhZNnAH3n0hD5hZ7H7i3CcGv+WhGjge+BaNXyGqX6l8yoErlYO6KtRC1gH31DtuCc65F/cT27eACcBX8K5Q5vrlxr4aXbdz7u/OuePwKhKH130N59xy59zFeP9w/BZ4zbx+4A2pH19j29zIvt+nHiGvy/G6sXk7ZNZ1P9sEuBmv694Y51wnvM+7/nHY7+dqZvF43Wv+6Jx7r178z9WLP9E5d29T1isih0T1VMvVUxvwzu2AN0AL3vl2vV9UQcj5Fgg93zZlW38H3gJ6OOdSgIdpuB7ah3/B7BXgErxEsbFj1VA864Az632Occ67f2wjXnc4YM+5PiPkvXvVM+y93/U9hNclur9fz/wf++5jY/XMQLyk+JvOudAk6Tf++4b5672UJtZdeJ9rupklh5T15MvPVVqIkp8OwDm3EfgQ74TeybwbBvua2YGaWjsDPzCzaDO7AK+v9LsNLPcwcLvf3L/7ptUL/NeGdzXtCbxWhY3A3fvZ3mYg1/YdRetZ4K9ArXNu8r5v22MK3j/Fo4HpzrmFeBXAGLzWgeZs83B4DLjWv4pmZpZo3o2ku09sm9n7pvpkvH8OtuKdwH99MOs2s4FmdrJfue/E62deB2Bml5pZln+lsMRfV1OH
2Gxsfz731/M9M4syswl4n9Nuc4HBZjbcvAEh7mxkO8l+zCXm3eh6RxPj2+1JYIlz7nf1yp8HvmZmp/utbHHm3RCb08A6RKQFqZ5q0XrqFeCrZnaKeQMJ3IxXd0zx58/Ba6mKNG9AhdBjvBnIsIa7U++WjNcCsdPMRuMlfM3xLN79WufgnXeb42HgHvtysIosvz4BeA3vHD7OvMEJ7mLvxGIOcJaZpfsX2G5sZDvJwHagzLyBL65raoBm1gmvhetnDXwXkvHuSyrxE+4f15tfv97fw0+ipgC/8eunYXjf1xeaGps0jZKfjuNyvBsAFwHb8E4S3Rp9B0wD+gNb8O6rON810B/ZOfcPvFaCl/xm3AV4fbcBfoDXHP1zvxvBlcCVZnZ8A9t71f+51cxmhZQ/h9cVodErRM65cmAWsNDv7wveP91rnNd/vCH722arc87NwLtP5q94n0k+XoWw22+An/nN+7fgVRhr8K7yLMK76fNg1h0L3Iv3uW7C++fh//x5ZwALzXtO0Z+Ai+p1bTuo/fE/j6/jnahL8K52/QuvQsbvevZL4N94I+c09s/DH/H6r2/xj8H7TYkvxEXAebb3aDzH+xXLBLxjUYR3hfHH6DwocrionmpYs+op59xSvHPsX/COy9fwhhffvb0f+mUleC0wb4a8dwnePTwr/bqnoS6E1wO/NLMdePfVvHKgmOrF9z+8e0ZnOe9+mub4E16r04f+9qfiJY74ieT3gZfwEtgdePeE7e6C+BzehbbVeIn2y41s5xa8pG4H3oW9xpatbyRegvt72/fZf3f580uBd4A36r23fr1f38V4vT42AP/Au+/2o2bEJk1gzev2Kh2FmV2BN0rOcW0glni8E9hI59zyoOORlmFm04CHnXNPBR2LiLQ/qqfaLzP7GPi7c+7xVtxGEl6C1985t6q1tiMdj654SltwHfCFKpT2zcxONO85D1FmNhFviNTmttqIiLRFqqeayMyOxmv9aE5rSlPX/TXzBsVJxBvlbj5fDg4k0iQaIUICZWar8frsnhtwKHLoBuJ1j0jCez7F+X4/fxGRdkv1VNOZ2TN4x+mH9UYtaykT8Lq3Gd6w4hc1c+Q+EXV7ExERERGR8KBubyIiIiIiEhaU/IiIiIiISFho0/f8ZGZmutzc3KDDEBEJezNnztzinMsKOo62SHWViEjwmlpPHTD5MbMeeM8f6Yo3bvujzrk/mdl9eOPIV+Pd3Hylc67Ef8/teM/7qAN+4Jz7wC8/A28M90jg8XpPVt9Hbm4uM2bMOFCIIiLSysxsTdAxtFWqq0REgtfUeqop3d5qgZudc0cCxwA3mNkg4CNgiHNuGLAMuN3f8CC8hwwOxnug4oP+U4Yjgb/hPXRsEHCxv6yIiMhBM7MeZvaJmS02s4Vm9kO//D4zW2Jm88zsH2aWGvKe280s38yWmtnpIeVn+GX5ZvaTIPZHRERazwGTH+fcRufcLP/1DmAxkO2c+9A5V+svNhXI8V9PAF5yzlX5D53KB0b7U75zbqX/FOKX/GVFREQOhS7SiYhIkzRrwAMzywVGANPqzfo28J7/OhtYFzKvwC/bX7mIiMhB00U6ERFpqiYnP2aWBLwO3Oic2x5S/lO8q24v7C5q4O2ukfL627nGzGaY2YyioqKmhiciInLYLtKprhIRaZ+alPyYWTRe4vOCc+6NkPKJwNnAJSFP2C0AeoS8PQfY0Ej5Xpxzjzrn8pxzeVlZGlhIRESa5nBdpAPVVSIi7dUBkx8zM+AJYLFz7vch5WcAtwHnOOcqQt7yFnCRmcWaWW+gPzAd+ALob2a9zSwGr7/1Wy23K/uqqq1rzdWLiEgbcTgv0omISPvVlOf8HAtcBsw3szl+2f8BfwZigY+8/IipzrlrnXMLzewVYBHelbYbnHN1AGb2PeADvKGun3TOLWzRvQmRX1jGxCenc+83hnJ8f12VExHpqJpwke7EBi7S/d3Mfg9058uLdIZ/kQ5Yj3eR7lutFXdV
bR2n/v4zUuKjv5wSvnydlhBN99R4ctIS6J4aR2xUZGuFIiISNg6Y/DjnJtNwV4B3G3nPPcA9DZS/29j7WlJWUizJcVFc8+xMnr96NKN6pR+OzYqIyOHXLi/S1dQ5RvZMpbSyhpLKGjaUVrK9sobSyhpq6vbubWcGnZNjyUlLICctnn5ZSRzXP5NhOalERjRURYuISEPsy14AbU9eXp47lAfHFe2o4puPfM6WsipeuuYYBndPacHoRETCh5nNdM7lBR1HW3SodVV9zjkqa+rYWlbNhpJKCrbtniq8nyXeT+egU1wUx/bL5Pj+WRzfP5Me6QktFoeISHvS1HqqKd3e2q2s5Fiev3oMFzw0hcufmM4r146lb1ZS0GGJiIjsl5mREBNFQnoUPdITGNPAMsXl1fwvfwv/XV7E5OVbeG/BJgByMxL45tE9uOq43uomJyLSgA7d8rPbyqIyvvnI50RHRvDqtWPJSdOVMRGR5lDLz/61dMtPcznnWFFUzuTlRfx7cSGT87fQJyuRuycM4dh+mYHFJSJyODW1nmrWQ07bqz5ZSTz77TGUV9Vy6ePTKNyxM+iQREREWoSZ0a9zElcc25vnrx7DU1ceTW2d45LHp/GDF2dTuF11nojIbmGR/AAM6t6Jp64cTeGOKi5/YjolFdVBhyQiItLiThrYmQ9vOoEfntKf9xds4pQHPuWp/62itm5X0KGJiAQubJIfgFG90nj0sjxWFpVz5dNfULer7Xb5ExEROVhx0ZHcdOoAPrjpBEb0SuOutxdxzl//x5qt5UGHJiISqLBKfgCO65/JnecMZvbaEuasKwk6HBERkVbTOzORZ648mgcvGcmG0kqufOoL9XwQkbAWdskPwFlDuxJh8OnSwqBDERERaVVmxllDu/HoZXkUbKvk2udnUl2rLnAiEp7CMvlJTYhhRM80Ji0rCjoUERGRw2J073R+e/5Qpq4s5vY35tOWR3sVEWktYZn8AIwfkMW8glK2lFUFHYqIiMhhcd6IHH54Sn9en1XA3z7JDzocEZHDLnyTn4GdAfhMrT8iIhJGbvxKf84d3p37P1zG23M3BB2OiMhhFbbJz+DunchMiuFTJT8iIhJGzIzfnj+Mo3PTuPnVucxcUxx0SCIih03YJj8REcYJA7L4bFmRhrwWEZGwEhsVySOX5dEtJY7vPDuTtVsrgg5JROSwCNvkB+DEAVlsq6hhXoGGvBYRkfCSnhjDU1ccTd0ux5VPT6eiujbokEREWl1YJz8n9M8iwmDSUnV9ExGR8NMnK4kHLxnJiqJy/vjv5UGHIyLS6sI6+UlLjOGoHqka8lpERMLWsf0yuejoHjwxeRUL1pcGHY6ISKsK6+QHYPyAzswrKGGrhrwWEZEwdfuZR5KWEMPtb8zXfbAi0qEp+RmYhXPw3+Vbgg5FREQkECkJ0dzxtUHMX1/K01NWBx2OiEirCfvkZ2h2ChmJMUxaWhh0KCIiIoE5e1g3xg/M4oEPl7K+pDLocEREWkXYJz97hrxevoVdauoXEZEwZWbcPWEIzsHP31yAc6oTRaTjCfvkB7yub8Xl1czTjZ4iIhLGeqQncPNpA/h4SSHvzt8UdDgiIi1OyQ9wfP8szFDXNxERCXtXjMtlSHYn7nx7IaWVNUGHIyLSopT84D3o7aicVD3vR0REwl5UZAT3fn0YW8uq+O37S4IOR0SkRSn58Z04IIu5BSUUl1cHHYqIiEighmSn8O1je/P3aWv5YnVx0OGIiLQYJT++L4e8VuuPiIjITacOIDs1np+8Po+q2rqgwxERaRFKfnzDclJJS4hW1zcREREgMTaKe84bwoqich78ZEXQ4YiItAglP77I3UNeLyvSkNciIiLA+IGdOXd4dx6clM+yzTuCDkdE5JAp+QkxfmAWW8urWbBBQ16LiIgA/PzsQSTFRnHb6/Oo08VBEWnnlPyEOGHPkNfq+iYiIgKQkRTLL742iNlrS3h+6pqgwxEROSQHTH7MrIeZfWJm
i81soZn90C9PN7OPzGy5/zPNLzcz+7OZ5ZvZPDMbGbKuif7yy81sYuvt1sHJSIplWHYKny5T8iMiIrLbucOzOWFAFr97fwnrSyqDDkdE5KA1peWnFrjZOXckcAxwg5kNAn4C/Mc51x/4j/87wJlAf3+6BngIvGQJuAMYA4wG7tidMLUlx/bLZO66EsqqaoMORUREpE0wM+45dwgO+Nk/5uOcur+JSPt0wOTHObfROTfLf70DWAxkAxOAZ/zFngHO9V9PAJ51nqlAqpl1A04HPnLOFTvntgEfAWe06N60gHF9M6nd5fRcAxERkRA90hO45bSBfLK0iLfnbQw6HBGRg9Kse37MLBcYAUwDujjnNoKXIAGd/cWygXUhbyvwy/ZX3qaM6pVGdKQxdcXWoEMRERFpUyaOy+WoHqnc9dZCtumh4CLSDjU5+TGzJOB14Ebn3PbGFm2gzDVSXn8715jZDDObUVR0+O+9iY+JZESPND5fqeRHRKQ9CKd7U4MWGWH89htDKa2s4e53FgUdjohIszUp+TGzaLzE5wXn3Bt+8Wa/Oxv+z0K/vADoEfL2HGBDI+V7cc496pzLc87lZWVlNWdfWszYvhksWF9KaWVNINsXEZFmCat7U4N2RNdOXDe+L2/MWs9nGiBIRNqZpoz2ZsATwGLn3O9DZr0F7L4qNhH4Z0j55f6VtWOAUr9b3AfAaWaW5lcmp/llbc7YvhnscjB9le77ERFp68Lt3tS24IaT+tE3K5Hb35ivAYJEpF1pSsvPscBlwMlmNsefzgLuBU41s+XAqf7vAO8CK4F84DHgegDnXDFwN/CFP/3SL2tzRvRMJTYqgikrtgQdioiINEM43JvaFsRFR3LfBUexsbSSX7+7OOhwRESaLOpACzjnJtPw/ToApzSwvANu2M+6ngSebE6AQYiNiiQvN43PNeiBiEi7Uf/eVK/jQsOLNlDW5HtT/W1dg9dljp49ezY/2A5gZM80rj6+D49+tpKzhnTjuP6ZQYckInJAzRrtLZyM7ZPBkk07KNZoNiIibd7hvDcV2sb9qW3Bj04dQJ/MRG57fZ66v4lIu6DkZz/G9vWuYE3VqG8iIm1aON6b2lZ43d+GsaG0knvfU/c3EWn7lPzsx7CcFBJiItX1TUSk7Qu7e1PbklG90rnq2N48P3UtU/J1r6yItG0HvOcnXEVHRjC6d7qe9yMi0saF472pbc3Npw3kP0sKufX1eXxw4wkkxurfCxFpm9Ty04ixfTLILyyjcPvOoEMRERFps+JjIrnv/GGsL6nkt+8vCTocEZH9UvLTiHH+fT9q/REREWlcXm463z62N89+vkZdxkWkzVLy04hB3TvRKS5KJ3EREZEmuOW0geRmJHDr63Mp1+hvItIGKflpRGSEMbp3hlp+REREmiA+xnv4acG2Su59T93fRKTtUfJzAOP6ZrBmawXrSyqDDkVERKTNOzrXG/3tualrmLxco7+JSNui5OcAxvbNAFDXNxERkSa65fSB9M1K5NbX5rJ9Z03Q4YiI7KHk5wAGdkkmPTFGyY+IiEgTxUVH8sA3h7Np+07ufntR0OGIiOyh5OcAIiKMY/qk8/mKLXiPhhAREZEDGd4jlevG9+XVmQX8Z/HmoMMREQGU/DTJ2D4ZbCjdydriiqBDERERaTd+cEp/juiazE/emM+28uqgwxERUfLTFGN3P+9HXd9ERESaLDYqkge+eRTbyqu5462FQYcjIqLkpyn6ZiWSlRzLFCU/IiIizTK4ewo/PKU/b83dwLvzNwYdjoiEOSU/TWBmjOvrPe9H9/2IiIg0z3Xj+zIsJ4WfvbmAoh1VQYcjImFMyU8Tje2TQdGOKlYUlQUdioiISLsSFRnBAxccRVlVLf/3j/m6kCgigVHy00S7n/ejrm8iIiLN179LMj8+bSAfLdrMqzMKgg5HRMKUkp8m6pmeQE5aPJ8t09OqRUREDsZVx/VmbJ8M7nx7IWu2lgcdjoiEISU/TWRmjB+Y
xZQVW6iqrQs6HBERkXYnIsJ44JtHERVh3PjyHGrrdgUdkoiEGSU/zTB+QGcqquuYsXpb0KGIiIi0S91T47nnvKHMXlvCXz/JDzocEQkzSn6aYVy/DGIiI5i0tDDoUERERNqtrx3VnfNGZPOXj/OZtVYXFEXk8FHy0wwJMVGM6ZPOJ0uLgg5FRESkXbtrwmC6dorjppfnUF5VG3Q4IhImlPw004kDssgvLKNgW0XQoYiIiLRbneKi+cOFw1lbXMHd/1oUdDgiEiaU/DTT+IGdAZik1h8REZFDMrp3Oted2JeXvljH+ws2BR2OiIQBJT/N1DcrkZy0eCU/IiIiLeDGrwxgSHYnbn9jHoXbdwYdjoh0cEp+mklDXouIiLScmKgI/njhCCpr6rj51bns2uWCDklEOjAlPwdBQ16LiIi0nH6dk/jF2YP57/ItPPzZiqDDEZEOTMnPQdCQ1yIiIi3r4tE9OHtYNx74cBkzVhcHHY6IdFAHTH7M7EkzKzSzBSFlw81sqpnNMbMZZjbaLzcz+7OZ5ZvZPDMbGfKeiWa23J8mts7uHB4a8lpERKRlmRm/+fpQslPj+cGLsympqA46JBHpgJrS8vM0cEa9st8BdznnhgO/8H8HOBPo70/XAA8BmFk6cAcwBhgN3GFmaYcafJA05LWIiEjLSo6L5q/fGkFRWRW3vDoP53T/j4i0rAMmP865z4D67c8O6OS/TgE2+K8nAM86z1Qg1cy6AacDHznnip1z24CP2Dehalc05LWIiEjLG5aTyu1nHsm/F2/m6Smrgw5HRDqYg73n50bgPjNbB9wP3O6XZwPrQpYr8Mv2V74PM7vG70o3o6io7SYWGvJaRESkdVx5bC5fObILv353MfMKSoIOR0Q6kINNfq4DbnLO9QBuAp7wy62BZV0j5fsWOveocy7POZeXlZV1kOG1Pg15LSIi0jrMjPsvGEZWUizff3E2O3bWBB2SiHQQB5v8TATe8F+/incfD3gtOj1ClsvB6xK3v/J2TUNei4iItI7UhBj+fPEICrZVcvsb83X/j4i0iINNfjYAJ/qvTwaW+6/fAi73R307Bih1zm0EPgBOM7M0f6CD0/yydk1DXouIiLSevNx0bj5tAP+at5EXpq0NOhwR6QCaMtT1i8DnwEAzKzCzq4DvALa6njAAACAASURBVA+Y2Vzg13gjuwG8C6wE8oHHgOsBnHPFwN3AF/70S7+sXdOQ1yIibYMey9BxXXtCX8YPzOKutxcye616WojIoWnKaG8XO+e6OeeinXM5zrknnHOTnXOjnHNHOefGOOdm+ss659wNzrm+zrmhzrkZIet50jnXz5+eas2dOpw05LWISJvwNHosQ4cUEWH88cLhdE2J4/oXZrG1rCrokESkHTvYbm/i05DXIiLB02MZOrbUhBgeumQUxeXVfP/F2dTW7Qo6JBFpp5T8HCINeS0i0ma12mMZ5PAbkp3CPecNZcqKrdz/4bKgwxGRdkrJzyHSkNciIm1Wqz2Wob08k66jOX9UDpeM6cnDn67g/QUbgw5HRNohJT8t4KSB3pDX01a2+zEcREQ6klZ7LEN7eSZdR/SLrw1ieI9Ubnl1HvmFZUGHIyLtjJKfFnBsv0wSYyJ5b8GmoEMREZEv6bEMHVBsVCQPXTqS2KgIrn1+JuVVtUGHJCLtiJKfFhAXHclJR3Tmw4WbqNulh7CJiBxueixDeOmWEs9fLh7ByqIybn1tnh6AKiJNFhV0AB3FWUO78a95G5m+qpixfTOCDkdEJKw45y7ez6xRDSzrgBv2s54ngSdbMDRpJeP6ZXLbGUfwm/eWMPSzFK49sW/QIYlIO6CWnxYyfmAWcdERugFTRETkMLnmhD6cPawbv31/CZ8sKQw6HBFpB5T8tJCEmCjGD+jMews2sUtd30RERFqdmXHf+UcxqFsnfvDSbFYUaQAEEWmckp8WdObQrhTuqGL2um1BhyIiIhIW4mMieeSyUURHRvCdZ2ewfWdN
0CGJSBum5KcFnXxEZ2IiI3h3vkZ9ExEROVxy0hJ48JKRrN1awY0vzdHgQyKyX0p+WlByXDTH98/k/QWbNPKMiIjIYXRMnwzuOGcwHy8p5IEPlwYdjoi0UUp+WtiZQ7uxvqSSeQWlQYciIiISVi4d05OLR/fkwUkreHtug8+nFZEwp+SnhZ16ZBeiIox3NeqbiIjIYWVm3HXOYPJ6pfHj1+ayYL0uRIrI3pT8tLCUhGjG9VPXNxERkSDEREXw0KWjSEuI4ZpnZ1C4Y2fQIYlIG6LkpxWcOaQra7ZWsGjj9qBDERERCTtZybE8dnkexRXVfPe5meysqQs6JBFpI5T8tILTBnUhwuD9BRr1TUREJAhDslP4wzeHM3ttCbe9Pk+9MUQEUPLTKjKSYhnTO4N35+u+HxERkaCcObQbt5w2gH/O2cBfP84POhwRaQOU/LSSs4Z2ZUVROcs37wg6FBERkbB1w0n9OHd4dx74aJkuSoqIkp/WcvrgrpihB56KiIgEyMy49xvDGNkzlR+9Mof5ehSFSFhT8tNKOneKI69XGu9pyGsREZFAxUVH8shleWQkxnL1s1+wqVQjwImEKyU/reiMId1YsmkHK4vKgg5FREQkrGUlx/L4xDzKdtbynWdnUFmtEeBEwpGSn1Z0xpCuALynUd9EREQCd2S3TvzpohEs2FDKTS/PoW6XRoATCTdKflpRdmo8R/VIVdc3ERGRNuIrg7rw07OO5P2Fm/j1u4uDDkdEDjMlP63sa8O6sWD9dhZt0ANPRURE2oKrjuvNFeNyeWLyKp6cvCrocETkMFLy08rOH5VDbFQEz01dHXQoIiIigjcC3M/PHsTpg7tw9zuLeF89NETChpKfVpaaEMOE4d15c/YGSitrgg5HREREgMgI448XjmB4j1R++NIcZq7ZFnRIInIYKPk5DC4fm0tlTR2vzSwIOhQRERHxxcdE8vjleXRLiePqZ75g1ZbyoEMSkVZ2wOTHzJ40s0IzW1Cv/PtmttTMFprZ70LKbzezfH/e6SHlZ/hl+Wb2k5bdjbZtSHYKI3um8vzUNezSyDIiIiJtRkZSLE9fORoz48qnprO1rCrokESkFTWl5edp4IzQAjM7CZgADHPODQbu98sHARcBg/33PGhmkWYWCfwNOBMYBFzsLxs2Lh+by6ot5UzO3xJ0KCIiIhIiNzORxy7PY2PpTq7WM4BEOrQDJj/Ouc+A4nrF1wH3Oueq/GUK/fIJwEvOuSrn3CogHxjtT/nOuZXOuWrgJX/ZsHHm0K5kJMbw7Odrgg5FRERE6hnVK40/XTSCOetKuP6FmVTX7go6JBFpBQd7z88A4Hgzm2Zmn5rZ0X55NrAuZLkCv2x/5WEjNiqSi0b34D9LNrOuuCLocERERKSeM4Z05Z5zh/LJ0iJufnWuHoIq0gEdbPITBaQBxwA/Bl4xMwOsgWVdI+X7MLNrzGyGmc0oKio6yPDapkvG9MKAF6atDToUERERacC3xvTkJ2cewdtzN/Dzfy7AOSVAIh3JwSY/BcAbzjMd2AVk+uU9QpbLATY0Ur4P59yjzrk851xeVlbWQYbXNnVPjefUQV14+Yu17KxRf2IREZG26NoT+3L9+L78fdpafvv+0qDDEZEWdLDJz5vAyQBmNgCIAbYAbwEXmVmsmfUG+gPTgS+A/mbW28xi8AZFeOtQg2+PLh+by7aKGt6ZpweqiYiItFU/Pn0glx7Tk4c/XcGDk/KDDkdEWkjUgRYwsxeB8UCmmRUAdwBPAk/6w19XAxOd1y680MxeARYBtcANzrk6fz3fAz4AIoEnnXMLW2F/2rxxfTPom5XIs1PX8I1ROUGHIyIiIg0wM355zhB27Kzld+8vpVNcNJce0yvosETkEB0w+XHOXbyfWZfuZ/l7gHsaKH8XeLdZ0XVAZsblY3O5462FzF1XwlE9UoMOSURERBoQEWHcf8FRlO2s5ef/XEByXBQThofVeE0iHc7BdnuTQ/D1kdkkxkRq
2GsREZE2Ljoygr9dMpLRuen86JW5/Gteg7csi0g7oeQnAMlx0Zw3Mpu3522guLw66HBERESkEXHRkTxxxdGM6pnGD16czT9mFwQdkogcJCU/Abl8bC7Vtbt4+Yt1B15YREREApUUG8XT3z6aMb0z+NErc3l1hupvkfZIyU9ABnRJ5rh+mTz62QpKKtT6IyJysMzsSTMr9AfhCS3/vpktNbOFZva7kPLbzSzfn3d6SPkZflm+mf3kcO6DtA8JMVE8ecXRHNcvkx+/No+/67l9Iu2Okp8A/fSrR1JaWcMfPloWdCgiIu3Z08AZoQVmdhIwARjmnBsM3O+XD8J73MJg/z0PmlmkmUUCfwPOBAYBF/vLiuwlPiaSxy7P46SBWfzfP+bz7Oergw5JRJpByU+AjuzWiUuP6cVzU9ewZNP2oMMREWmXnHOfAcX1iq8D7nXOVfnLFPrlE4CXnHNVzrlVQD4w2p/ynXMrnXPVwEv+siL7iIuO5OHLRnHqoC784p8Lefy/K4MOSUSaSMlPwH506gBS4qO5458L8R6VJCIiLWAAcLyZTTOzT83saL88Gwi9WaPAL9tfuUiDYqMiefCSkXx1aDd+9c5i/vZJvupxkXZAyU/AUhNiuOX0gUxbVcw78zcGHY6ISEcRBaQBxwA/Bl4xMwOsgWVdI+UNMrNrzGyGmc0oKipqiXilHYqOjOBPFw3nvBHZ3PfBUn725gJq63YFHZaINELJTxtw0dE9GdStE/e8s5iK6tqgwxER6QgKgDecZzqwC8j0y3uELJcDbGikvEHOuUedc3nOubysrKwWD17aj6jICB644CiuG9+XF6at5ZrnZlJepbpcpK1S8tMGREYYd00YzMbSnTw0aUXQ4YiIdARvAicDmNkAIAbYArwFXGRmsWbWG+gPTAe+APqbWW8zi8EbFOGtQCKXdiciwrjtjCO457whTFpayIWPfk7h9p1BhyUiDVDy00YcnZvOucO788hnK1m7tSLocERE2g0zexH4HBhoZgVmdhXwJNDHH/76JWCi3wq0EHgFWAS8D9zgnKtzztUC3wM+ABYDr/jLijTZJWN68cTEo1lZVM55D05h2eYdQYckIvVYW745Ly8vz82YMSPoMA6bTaU7OfmBSRzbL5PHLs8LOhwRkT3MbKZzTiemBoRbXSUHtmB9KVc+/QU7a+p45LJRjOubGXRIIh1eU+sptfy0IV1T4vjeyf34aNFmPl2mG2hFRETaoyHZKfzj+nF07RTHxCen8/IXehiqSFuh5KeNueq43uRmJHDX2wuprtWIMSIiIu1RTloCr103jjG9M7jt9fnc9to8dtbUBR2WSNhT8tPGxEZF8ouvDWJlUTl/+Xh50OGIiIjIQUqJj+aZb4/meyf14+UZ6zj/4SmsK9Z9vSJBUvLTBp18RBcuGJXDXz7O59+LNgcdjoiIiBykyAjjltMH8vjleazZWsFX//xfPl6iul0kKEp+2qi7zx3C0OwUbnp5Dqu2lAcdjoiIiByCrwzqwjvfP56ctAS+/fQMHvhwKXW72u6gUyIdlZKfNiouOpKHLh1JVKTx3edm6IFpIiIi7VzPjATeuH7cnt4dVzw1ncIdeh6QyOGk5KcNy0lL4C8XjyS/sIxbX59HWx6WXERERA4sLjqS+y44inu/PpRpq4o5/Q+f8d78jUGHJRI2lPy0ccf1z+TWM47gnXkbefy/q4IOR0RERFrARaN78s73jyMnLYHrXpjFTS/PobSyJuiwRDo8JT/twHdP6MOZQ7rym/cWMyV/S9DhiIiISAvo3yWZN64fxw9P6c9bczdwxh8/Y/Jy1fMirUnJTztgZtx3wVH0yUriey/OZkNJZdAhiYiISAuIjozgplMH8MZ144iPieTSJ6Zx51sLqazWM4FEWoOSn3YiKTaKRy4bRXXtLq57fiYV1RoAQUREpKM4qkcq73z/eK4Yl8vTU1Zz1p//y//U20OkxSn5aUf6ZiXxhwuHM399KZc/MZ3t
O9U3WEREpKOIj4nkznMG88LVY6jb5bjk8Wnc+NJsinZUBR2aSIeh5KedOXVQF/76rZHMWVfCJY9NY1t5ddAhiYiISAs6tl8mH950Aj84uR/vzN/IyQ9M4rmpa/RcIJEWoOSnHTpraDcevXwUSzfv4KJHp+oZASIiIh1MXHQkPzptIO/feAJDs1P4+ZsL+PpDU1iwvjTo0ETaNSU/7dTJR3Th6SuOZt22Ci58ZCrrNQiCiIhIh9M3K4kXrh7DHy8czvptFZzz18n8/M0FbC1TVziRg6Hkpx0b1y+T564azZayKr758Oes3lIedEgiIiLSwsyMc0dk858fjeeSMb34+/S1nHjfJP72ST47azQqnEhzKPlp50b1SufF7xxDRXUt33zkc5Zv3hF0SCIiItIKUhKiufvcIXxw4/Ec0yed+z5Yysn3T+L1mQXs0v1AIk1ywOTHzJ40s0IzW9DAvFvMzJlZpv+7mdmfzSzfzOaZ2ciQZSea2XJ/mtiyuxHehmSn8Mp3xwLw9Yem8K95GwKOSERERFpLv87JPD7xaF78zjFkJMVy86tzOfsvkzU0tkgTNKXl52ngjPqFZtYDOBVYG1J8JtDfn64BHvKXTQfuAMYAo4E7zCztUAKXvfXvkszr142jX+ckvvf32dz22jw9C0hERKQDG9s3g3/ecCx/umg4pZU1XPL4NL712FSmrtwadGgibdYBkx/n3GdAcQOz/gDcCoS2s04AnnWeqUCqmXUDTgc+cs4VO+e2AR/RQEIlh6ZHegKvfHcsN5zUl1dmruPsv0xm4QaNCiMiItJRRUQYE4Zn85+bT+RnXz2SZZvLuOjRqVz4yOdMWbEF59QdTiTUQd3zY2bnAOudc3PrzcoG1oX8XuCX7a9cWlh0ZAQ/Pv0IXrhqDOVVtZz3tyk89b9VOvmJiIh0YHHRkVx9fB/+e+tJ/OLsQazaUs63HpvGhY9MZfJyJUEiuzU7+TGzBOCnwC8amt1AmWukvKH1X2NmM8xsRlFRUXPDE9+4fpm898MTOGFAJne9vYirnpnBFg2LKSIi0qHFx0Ty7eN689mtJ3HXOYNZW1zBpU9M4/yHP+f9BZv0oFQJewfT8tMX6A3MNbPVQA4wy8y64rXo9AhZNgfY0Ej5Ppxzjzrn8pxzeVlZWQcRnuyWnhjDY5fncdc5g5mcv4WT7p/EY5+tpLp2V9ChiYiISCuKi45k4rhcJv14PHdPGMym0p1c+/xMTrp/Ek//bxXlVbovWMJTs5Mf59x851xn51yucy4XL7EZ6ZzbBLwFXO6P+nYMUOqc2wh8AJxmZmn+QAen+WXSysyMieNyefcHx5PXK4173l3MaX/4lA8XblITuIiISAcXFx3JZWNz+fTH43nwkpFkJsVw59uLOOY3/+E37y5mgx6SLmGmKUNdvwh8Dgw0swIzu6qRxd8FVgL5wGPA9QDOuWLgbuALf/qlXyaHSb/OSTx15WievvJooiIjuOa5mVz6xDQWb9wedGgiIiLSyqIiIzhraDfeuP5Y3rh+HCcMyOKx/67k+N99wg1/n8WUfN0XJOHB2vIXPS8vz82YMSPoMDqcmrpdvDh9Lb//aBnbK2u48OiefP/kfnRPjQ86NBFpo8xspnMuL+g42iLVVdJeFWyr4Jkpq3llRgGllTXkZiRw0eienD8qh8yk2KDDE2mWptZTSn7CWGlFDX/6z3Ke/Xw1ABOGZ/PdE/swoEtyoHGJSNuj5Gf/VFdJe7ezpo73FmzkxWnrmL66mOhI47RBXbl4dE/G9c0gIqKhcatE2hYlP9JkBdsqeGLyKl6avo7KmjpOOaIz3z2xL0fnpmGmE56IKPlpjOoq6UjyC3fw4vR1vD6rgJKKGrJT45kwvDvnjcimvy6OShum5EeabVt5Nc9NXcPTU1ZTXF7NiJ6pXHN8H74yqAvRkQf1SCgR6SCU/Oyf6irpiHbW1PHBwk28MWs9/11exC4Hg7t34rwR2XztqO506RQXdIgie1Hy
IwetsrqO12YV8NhnK1lbXEFmUizfGJnNBXk96Nc5KejwRCQASn72T3WVdHRFO6r417wNvDl7PXMLSokwGNc3k7OGduO0wV10f5C0CUp+5JDV1u3i02VFvPzFOj5eUkjtLseoXmlcmNeDrw7rRmJsVNAhishhouRn/1RXSThZUVTGP2ev559zN7BmawURBqN7p3PmkG6cPrgrXVPUIiTBUPIjLapoRxVvzCrg5RnrWFlUTkJMJGcM6cpXh3bjuP6ZxEZFBh2iiLQiJT/7p7pKwpFzjkUbt/P+gk28t2AT+YVlAIzsmcoZQ7pyypFd6JOZqHuH5bBR8iOtwjnHrLXbePmLdby/YBPbd9aSHBvFKUd25syh3ThxQBZx0UqERDqatp78mNmTwNlAoXNuSL15twD3AVnOuS3m/Tf2J+AsoAK4wjk3y192IvAz/62/cs49c6Btq64S8QZKeG++lwgt8p8h2CsjgZOP6MwpR3RhdO90YqJ0/7C0HiU/0uqqa3cxZcUW3pu/iQ8WbaKkooaEmEhOPqIzpw7qwgn9s0hLjAk6TBFpAe0g+TkBKAOeDU1+zKwH8DhwBDDKT37OAr6Pl/yMAf7knBtjZunADCAPcMBM/z3bGtu26iqRvRVsq+CTJYV8vKSQKSu2UlW7i8SYSI7vn8X4gVkc1z+TnLSEoMOUDqap9ZRu2pCDFhMVwfiBnRk/sDO/qhvCtJXFvDN/Ix8u3MS/5m3EDIb3SGX8gM6MH5jF0OwUPStARFqFc+4zM8ttYNYfgFuBf4aUTcBLkhww1cxSzawbMB74yDlXDGBmHwFnAC+2YugiHU5OWgKXjc3lsrG5VFbXMWXFFv6zpJBPlhTy/sJNAORmJHBc/0yO65fJ2D6ZpCREBxy1hAslP9IioiMjvJNY/0x+de4Q5hWUMGlpEZOWFfHH/yzjD/9eRkZiDCcMyGJs3wzG9skgJy1efYFFpNWY2TnAeufc3HrnmmxgXcjvBX7Z/spF5CDFx0RyypFdOOXILjjnyC8sY3L+FiYv38I/Zq3n+alriTAYmpPK2D4ZjOmdzqjcNDrFKRmS1qHkR1pcZIQxomcaI3qmcdOpA9haVsV/l29h0tJCPl1WxD9mrwege0ocx/TJ2DP1SFcyJCItw8wSgJ8CpzU0u4Ey10h5Q+u/BrgGoGfPngcZpUh4MTP6d0mmf5dkrjy2NzV1u5izroTJy7fwv/wtPDF5JQ9/uoIIg0HdOzE6N4PRvdMZ3TuddHWjlxai5EdaXUZSLOeOyObcEdk451heWMbUlVuZtrKYT5cV8YafDHXtFMfwHqmM6JnK8B6pDM1JISFGX1EROSh9gd7A7lafHGCWmY3Ga9HpEbJsDrDBLx9fr3xSQyt3zj0KPArePT8tG7pIeIiOjODo3HSOzk3nplMHUFldx+x125i2spjpq4p5YdoanvzfKgD6ZCYyomcaI3ulMrJnGgO6JBOprvRyEPSfpRxWZsaALskM6JLM5WNz9zSBT11VzBeripmzrmRPf+DICGNgl2RG9ExlWE4Kg7un0L9LkobVFpEDcs7NBzrv/t3MVgN5/oAHbwHfM7OX8AY8KHXObTSzD4Bfm1ma/7bTgNsPc+giYSs+JpJxfTMZ1zcTgKraOuYXlDJ9dTGz1pQwaWkhr88qACApNoqjeqQwvEcqw3K8/xO6dopTDxI5ICU/EqjQJvDLjukFwNayKuasK9kzvTVnAy9MWwtAdKTRr3Myg7t38qcUBnZNJiVefYNFwpmZvYjXapNpZgXAHc65J/az+Lt4I73l4w11fSWAc67YzO4GvvCX++XuwQ9E5PCLjYokLzedvNx0wHvcxtriCmat3cbstSXMWruNhz9dSd0ur/E1KzmWYdkpDM1JYVhOCkO6p5CVHKuESPaioa6lzdu1y7GmuIKFG0pZuGE7CzdsZ9GGUraUVe9ZpltKHAO6JDOwazL9OycxsGsy/TonqducSAtp60NdB0l1lUhwdtbUsWjj
duYXlDKvoJR5BSXkF5Wx+9/bzKQYjuzWiUH+BdNB3TrROzNRXeY6IA11LR1GRITROzOR3pmJnD2sO+Bd/SncUcXCDaUs3VTGss07WLZ5B59P2Up17a49781OjadPViJ9s5Lo6//sk5VEl066EiQiItLexUVHMrJnGiN7pu0pK6+qZcH6UhZt3M6iDdtZtHE7T05eRU2d898TsacL/hFdvQunA7skq5UoTCj5kXbJzOjSKY4uneI4+Ygue8pr63axtrjCT4bKWFlUxoqicl6ZsY6K6ro9yyXERNIrI5Fe6Qn0ykwgNyORXhkJ9MpIpGunOF0REhERaacSY6MY0yeDMX0y9pRV1+4iv7BsT0K0dPN2Ji0t4rWZBXuWSUuIZkAXr+dI/85J9OucTP8uSXRWUtShKPmRDiUqMoI+fuvOGUO+LHfOsXl7FSuKylhRVMaqLeWs2VrB8sIdfLykkOq6L1uLoiON7qnx5KTFk5Oa4P1MjycnLYFuKV7CFR0ZEcDeiYiIyMGIiYpgUHev+xujvizfWlbF0s07WLrJ60GydNMO3p67ge07a/cskxwbRd/OSfTrnETvzET6ZiXSJyuJnukJxEVrEKb2RsmPhAUzo2tKHF1T4ji2X+Ze8+p2OTaWVrJ2awWrt1ZQsK2CddsqKdhWwcdLCynaUbXX8hHm3VTZLSWe7qlxdEuJp5u/7i6d4ujaKY6s5FidEEVERNq4jKRYxiXF7hlhDrwLpkU7qsgvLGN5YZn/cwefLdu7pcgMctLi6Z2ZRJ9MrwfJ7p4kOWkJxETpQmlbpORHwl5khJGT5p2oxvXbd/7OmjrWl1RSsK2SjSWVbCjdyYaSSjaWVrJko9dytLNm1z7vS0uIpkunODp3iiMrKZas5Fg6J3s/d0+ZSbF0iotSc7qIiEgbYWZ09uvvcfUumO7YWcPqLRWs3FLGyqJyVm4pZ2VRGbPWbKOs6svWogiD7LR4cjMS6ZmesGfqkZ5Az4wEOsVplNqgKPkROYC46Eh/wISkBuc75yitrGHz9io2bd/J5u072Vy6k807drKptIqiHTvJ37yDorKqPTdbhoqJjCAjKYaMpBgyk2LJSIwlMymG9MQY0hJjyEj0Xu+ekmKVLImIiAQhOS6aoTnecNqhnHNsLa9mzdZyVm+pYPXWclZvrWDN1nLemb+RkoqavZZPTYimZ7rXtb5Hmt/FPi2BHunxZKcmEB+j3iOtRcmPyCEyM1ITYkhNiGFg1+T9Lrc7SSrcUUWRP20pq2JLWTVbyqrYWlbF1vJqlm3awZay6r3uQwoVExlBakI0aQkxpCXu/hlDml+WmhBDanw0qQnelBIfQ0p8tJrfRUREWomZkZnk9egY1St9n/mllTWsK65gXXEFa0OmJRt38O/FhXuNVAuQkRhDdlo83VPivZ+p8WT7U7fUODISY3Qh9CAp+RE5TEKTpAFd9p8kgZcolVfXUVxWTXFFNcXlVWwtq2ZbRTVby6spKa9hW0U1JRU1LC8so6Simm0VNXse9NaQhJhIUuKj6RQX7f2Mj6ZTfBQp8dEkx0XTKS6KTnHRJMdFeb/Hez+TYqNIjosiNipCJ1oREZGDkBIfTUp2CkOyU/aZt2uXY0tZFeu2VVCwrZJ1xRWsL9nJ+pJK8ovK+HRZEZU1dXu9JyYqgm4pcf7k3XvcLTWerv69x11SYslMjOX/2bvz+CrKs//jn+ucLEBC2AIBEiDssogbi2vF1gX3vUqte6u21ra/2uep7dPW1uqjbZ/W1tZateJe12q1uFKr4oayiYDsiBLWQNgDWa/fHzOBQ0hYk0xyzvf9ep3XOeeeOTPX3Odk7lwz99wT0+i1u1DyI9IMmRnZmWlkZ6bRs1ObvfqMu7OprJINpRWsLw2To60VbAgTo41bK9gQPjZuq2DZ+q3MWRGUb0rop1yftJiR3Spte1zZmWlkbX+OJ7wOHxlx2mTsmJaVkUabjDhtMoL3SqZERESC+xnWXGN0RK9dp7s7
60uDdnvZ+uD64xUbtrF8wzZWrN/KR5+VsHLjtl0OgKbFjC5tM8lrFyZE4YBMeTmtgvKcVuTlZNKudXpKtcdKfkSShJmR0yo4s9Nj1zPutOPPOgAAIABJREFUu1VV7Wwuq2TTtgo2batk07bKMCmqYHNZFZu3VbK5rILN2yrZVBZM31JWyfrScorWlbKlrIotZZVsLq/cflftPYkZtMlIo3WYELVOj4fJ0Y6yoDyN1hkxWqfHaZVeM33H+1bp8e2vg+cYrTLitEqLkx63lNqhi4hI8jGzoHt7VkadZ44gaMfXbC5j5YZt268/Tnw9f9Um3l24hk3bdj3YmRGPBYMwtc2kc3YmXXIytw/UlJudSee2Gdu79GVltvzUoeVvgYgcsHjMglPyrQ9s9Bl3Z2tFFVvKqigtrwySovIgUSotDxKkmulbyyvZUl5FaXkwb2l5FVvD12s2l7GtompHWUXVbrv01SdmbE+QWqXFaJUeJyN8bpUePqfFyUyP0SptR1lmWozMmue0GJnhPDWvM8LyjIT3GWkxMuKxHdPiMXU3EBGRJhGP7bj5+yG7mW9reRWrN21j1cay7c/Fm4LXxZvKKFpXyvQv1rF2S3mdn2+dHic3IRnqlBUM2NQpK3P7wE0dw8GaOmRlNMv7Iir5EZEGY2a0yUijTUYakNmgy66oqqa0vIptFUFCtDVMjsoqgtfbKqrD58RHdfBcmfC6opqyyirKKqop2VK+U1nic0NIi9lOiVF6fEfSlFhWO3nKiMdIT7NgWsI86fEY6XELn2OkxW379JrXteereZ/fvrUGvRARSXGtM+L06pRFr05Zu52voqqateGATMWby1izaccATTWPpSWlfLx0PSVbyus9QNm2VVrCqLWZdMxKp0NWBh3DwZo6Jgze1L196ya5R6KSHxFpEdLjMdq1jh3w2am94e6UV1VTVllNWUJCVF5ZHZRXVFFWGbwvq6ymvKoqmLb9ffX29+W131ft/FxRVU1paSVl4eua8soq3zFfVfVedyeszxs3Hl/vcO0iIiKJ0uOx7TeH35Pq6mA027VbyijetGNwppLN4YBNW4KyonWlzFxWzrotFXWOaPvo1SM5rn/nxticnSj5ERGpxcyCrm5pcdjzfr9JVFUHyVBFdTUVldVUhu8rqz1ImsLkqaLKtydRFWHyVFFVTZe2DXsmTkREBIIBG2quSerXZc/z14xouy5MikrC54O65jR+sOxF8mNm44AzgNXuPjQs+y1wJlAOLAKudPf14bQfA1cDVcB33f21sHwM8EcgDvzN3e9o+M0REUlO8ZjROiNOa3TjOxERabkSR7Tt0XHvRrRtSHvTAfwhYEytsgnAUHcfBswHfgxgZoOBi4Eh4Wf+YmZxM4sDdwOnAoOBseG8IiIiIiIiTWKPyY+7TwRKapW97u41Y+VNAgrC12cDT7p7mbt/BiwERoaPhe6+2N3LgSfDeUVERERERJpEQwz9cxXwSvg6H1iaMK0oLKuvXEREREREpEkcUPJjZv8DVAKP1xTVMZvvpryuZV5jZlPMbEpxcfGBhCciIiIiIrLdfic/ZnY5wUAIl7hvH4S1COiRMFsBsHw35btw9/vcfbi7D+/cufGHuxMRERERkdSwX8lPOHLbj4Cz3L00YdKLwMVmlmlmvYH+wEfAZKC/mfU2swyCQRFePLDQRURERERE9t7eDHX9BDAayDWzIuBmgtHdMoEJZgYwyd2vc/fZZvY08ClBd7jr3b0qXM53gNcIhroe5+6zG2F7RERERERE6mR+oLcNb0RmVgx8foCLyQXWNEA4yUB1sYPqYgfVxc5UHzsk1kUvd1df5Do0QFul39wOqosdVBc7qC52pvrYYZ/bqWad/DQEM5vi7sOjjqM5UF3soLrYQXWxM9XHDqqLpqF63kF1sYPqYgfVxc5UHzvsT100xFDXIiIiIiIizZ6SHxERERERSQmpkPzcF3UAzYjqYgfVxQ6qi52pPnZQXTQN1fMOqosd
VBc7qC52pvrYYZ/rIumv+REREREREYHUOPMjIiIiIiKSvMmPmY0xs3lmttDMboo6nqZmZuPMbLWZzUoo62hmE8xsQfjcIcoYm4qZ9TCzN81sjpnNNrPvheUpVx9m1srMPjKzGWFd/DIs721mH4Z18VR4M+KUYGZxM5tuZuPD9ylZF2a2xMxmmtnHZjYlLEu5v5GmpHZK7VQNtVM7qJ3aldqpQEO1U0mZ/JhZHLgbOBUYDIw1s8HRRtXkHgLG1Cq7CXjD3fsDb4TvU0ElcKO7DwKOBK4Pfw+pWB9lwJfd/RDgUGCMmR0J/Bq4M6yLdcDVEcbY1L4HzEl4n8p1cYK7H5owbGgq/o00CbVTgNqpRGqndlA7tSu1UzsccDuVlMkPMBJY6O6L3b0ceBI4O+KYmpS7TwRKahWfDTwcvn4YOKdJg4qIu69w92nh600EO5B8UrA+PLA5fJsePhz4MvBsWJ4SdQFgZgXA6cDfwvdGitZFPVLub6QJqZ1SO7Wd2qkd1E7tTO3UHu3z30iyJj/5wNKE90VhWarLc/cVEOxogS4Rx9PkzKwQOAz4kBStj/D0+cfAamACsAhY7+6V4Syp9PfyB+C/gerwfSdSty4ceN3MpprZNWFZSv6NNBG1U3VL+d+c2im1U7WondqhQdqptEYMMEpWR5mGtUtxZpYN/AP4vrtvDA6epB53rwIONbP2wPPAoLpma9qomp6ZnQGsdvepZja6priOWZO+LkLHuPtyM+sCTDCzuVEHlORS+bcm9VA7FVA7FVA7tYsGaaeS9cxPEdAj4X0BsDyiWJqTVWbWDSB8Xh1xPE3GzNIJGpTH3f25sDhl6wPA3dcDbxH0L29vZjUHQ1Ll7+UY4CwzW0LQ5ejLBEfYUrEucPfl4fNqgn82RpLifyONTO1U3VL2N6d2aldqp9ROJWqodipZk5/JQP9wNIwM4GLgxYhjag5eBC4PX18OvBBhLE0m7B/7ADDH3X+fMCnl6sPMOodH0jCz1sCJBH3L3wQuCGdLibpw9x+7e4G7FxLsI/7j7peQgnVhZllm1rbmNXAyMIsU/BtpQmqn6paSvzm1UzuondpB7dQODdlOJe1NTs3sNILsOA6Mc/fbIg6pSZnZE8BoIBdYBdwM/BN4GugJfAFc6O61LzZNOmZ2LPAOMJMdfWZ/QtCfOqXqw8yGEVwQGCc4+PG0u99iZn0Ijip1BKYDX3f3sugibVphd4IfuvsZqVgX4TY/H75NA/7u7reZWSdS7G+kKamdUjtVQ+3UDmqn6qZ2quHaqaRNfkRERERERBIla7c3ERERERGRnSj5ERERERGRlKDkR0REREREUoKSHxERERERSQlKfkREREREJCUo+RERERERkZSg5EdERERERFKCkh/Za2b2lpl9owGWMzu8WVeT2dM6G2rbmoqZXWJmr0cdx/4ws4fM7NYD+PwrZnb5nucUEdmZ2rHt8442s6IGjO2A9uuNycx+YWaPHcDn/2pmP2vImCRaaVEHIAfGzJYAeUAVsAV4GbjB3TdHGdfuuPuQmtdm9gugn7t/PdnW2Zjc/XHg8Zr3ZuZAf3dfGF1UTcPdT406BhFpOGrHmu8662JmVwDfcPdjm3rdUXD366KOQRqWzvwkhzPdPRs4HBgB/HRfF2BmSoSbEQtE8vfZXH8LjVUnUda1iGyndixFNefvzcziLWm5snfU4CcRd18GvAIMBTCzdmb2gJmtMLNlZnZrzR+cmV1hZu+Z2Z1mVgL8IqHsT2a2wczmmtlX6lufmV1lZnPMbJ2ZvWZmvcLyo81sjZn1CN8fYmbrzeyg8P0SMzvRzMYAPwEuMrPNZjbDzC40s6m11nOjmf2zjvWfYGYzE97/28w+Snj/rpmds6d1JiyyV7j9m8zsdTPLrWe7c81sfLhNJWb2Ts0/z+F6fmxmn4b18qCZtQqndQg/VxxOG29mBQnLfcvMbjOz94BSoE/4nSwOY/rMzC5J+P7eDV9PDBcxI9ymi8xs
lpmdmbDs9PA7ObSO7RltZkVm9iMzWwk8GJafYWYfh9v5vpkNS/jM4WY2PYzrGTN7ysIuD4mxJczvZtavjnXvT51s79oR/mY2Jzzcwm4hZnZkGPf6cL7Ru1tuXd+1iDQttWNN044lLP8n4XYuqWlfEur9kXDf/LmZ/dTMYmY2CPgrcFS47vUJi+tgZi+F6/7QzPrWs87CcF99tZl9AfwnLN/dPru3mU0Ml/1vM7vbwq5sVkcXvpq6qmf9z5jZyvD3MdHMEs+oPWRm95jZy2a2BTjBErr0mdm/arU51RacCcPMDjKzCRb8XzDPzL66h+WeZsH/CpvC3/YPd/ddSQNydz1a8ANYApwYvu4BzAZ+Fb7/J3AvkAV0AT4Crg2nXQFUAjcQdH9snVD2/4B04CJgA9Ax/MxbBKe6Ac4BFgKDws//FHg/Ia7bCHZorYFPgO/UE/MvgMcSpmUCJcCghLLpwPl1bHsrYCuQG8awElgOtA3XuxXotKd1JmzbImBA+Nm3gDvqqfPbCXb+6eHjOMAS1jMr/C46Au8Bt4bTOgHnA23CGJ8B/lkrhi+AIeH2tAM2AgPD6d2AIQnf37sJn3WCLhA17/8beCrh/dnAzHq2Z3T4vf86rP/WBEdfVwOjgDhwebhtmUAG8DnwvXD7zwPKE7Zzp9hqxwc8dAB1kk7C77DWOq4B5gI5QD6wFjiN4CDPSeH7zvUtN+q/ZT30SNUHaseiaMdGh/X0+zDe4wm6HNa0N48AL4RxFALzgasT6r32Pv6hcJtHhtvxOPBkPesuJGgTHgm/19bseZ/9AfB/BO3PsQRt42MJ21K0m99U7e/nqnC7MoE/AB/X2o4NwDFhHK1IaLNqrWNM+F31CLdjKXBluP2HA2vY0WbXtdwVwHHh9A7A4VH/LabKQ2d+ksM/w6Mv7wJvA/9rZnnAqcD33X2Lu68G7gQuTvjccnf/k7tXuvvWsGw18Ad3r3D3p4B5wOl1rPNa4HZ3n+PulcD/AofWHDUj2Nm0I2iolgN3782GuHsZ8BTwdYDwiEwhML6OebcBU4AvAcMJGqd3CXYuRwIL3H3t3qw39KC7zw/r4mlgl7MkoQqCRKRXWE/veLj3Cv3Z3Ze6ewlB4zk2jHetu//D3UvdfVM47fhay37I3WeHdVoJVANDzay1u69w99l7uS2PAaeZWU74/lLg0d3MXw3c7O5l4fZ/E7jX3T909yp3fxgoI6jXIwl27neF2/8cwfe8z/a1Tty9oq7lmNmxwK3AWe6+keD387K7v+zu1e4+geC3ctq+LFdEmozasaZtx2r8LNzvvw28BHzVgjNrFwE/dvdN7r4E+B1BO7I7z7n7R2FdPr4X6/5F+L1uZTf7bDPrSdAV8ufuXu7u7wIv7mHZ9XL3ceF2lRF8x4eYWbuEWV5w9/fCOLbVtQwzG0CQvF3k7kuBM4Al7v5g+FucBvwDuGA3y60ABptZjruvCz8jTUDJT3I4x93bu3svd/92uCPpRXDUa0V4Cnk9wdGzLgmfW1rHspbV+kf+c6B7HfP1Av6YsOwSwAiO3hD+M/kQQdeF39Va5p48DHzNzIxgZ/t0uJOqy9sER32+FL5+i+Cf5+PD9/tiZcLrUiC7nvl+S3C08HULuqTdVGt6Yr1urz8za2Nm94ZdCDYCE4H2tnPf3+2fdfctBA3QdQTf40s1XS72xN2XE5x1Ot/M2hP8A/H4bj5SXGsn3wu4seb7Db/jHuG2dGfX30ldv6U92tc6qWcZPQga+cvdfX5C/BfWiv9YgqT1gGIWkUahdqxp2zGAdWE7U6OmnnLZcYY/cVp+A64bdv7udrfP7g6UuHtpPZ/da2YWN7M7zGxR2OYsCScldg/cU5vTjuCs2M/c/Z2E+EfViv8SoOtulns+wQG5z83sbTM7an+2Sfadkp/ktZTgSH1u2KC0d/ccTxgthuC0c2354c66Rk+CI151Lf/ahGW3d/fW7v4+gJnlAzcTXD/yOzPLrCfOXWJw90kE
3aiOA77G7s9Y1G403mbPjca+NGC7fjg4YnSju/cBzgR+YDv3Ke+R8Dqx/m4EBgKj3D0njBmCxrbO2Nz9NXc/iaABmAvcvw+hPkxwNO1C4AMP+tLXu1m13i8Fbqv1/bZx9ycITtXX/p0kbvMWgm5sAJhZ4s6/tn2uk0Rm1pqgW8wf3P2VWvE/Wiv+LHe/Y2+WKyLNgtqxRmrHQh3MLCvhfU09rSE4K9Gr1rSaNqSh9p21D6DVt89eAXQ0szYJ8++uzYkDnetZ59cIuoGfSHBWr7DmY/XEtRMLru/9O/Cmu99bK/63a8Wf7e7fqm+57j7Z3c8mSOb/SXAQT5qAkp8k5e4rgNcJdtg5Flyo2NfMancpqq0L8F0LLpC/kKAv9Mt1zPdX4Mc1FwpacHHkheFrIzha9gBwNcGO61f1rG8VUGi7jrb1CPBnoDI8xV2f9wn+eR4JfORBt7BeBNeqTKznM/Wtc69YMBBAv3A7NxIMz1qVMMv1ZlZgZh0JLkp9KixvS9B/e3047eY9rCfPzM4KG6cyYHOt9dTeptoX7f+ToN/x9wjqc1/cD1xnZqMskGVmp5tZW4K+11XAd8wszczOJqj/GjOAIWZ2qAWDPfxiN+vZpzqpwzhgrrv/plb5Y8CZZnZKeKSvlQUXxRbUsQwRaYbUjjVeO5bgl2aWYWbHEXTdesbdqwj+Eb/NzNpa0A3wBwT71Zp1F5hZxgGuO1G9+2x3/5ygC9wvwliPIjjwWGM+0Cpso9IJrt2qL1FtS9CeriVImP53H+O8jeD6nu/VKh8PDDCzS8PfXbqZjbBggIhdhNtxiZm1C88w1vwvIU1AyU9yu4zg1PWnwDrgWXbu9lOXD4H+BEd+bgMuqKu/sbs/T3CB/JPhqeNZBF2rAL5LcM+Gn4XdBK4Ergx3rrU9Ez6vNbPE/q6PEnQ12N3RspquYdOA2e5eHhZ/AHzuQf/wutS3zr3VH/g3QTLyAfAXd38rYfrfCRrsxeGj5sZvfyC4sHMNMAl4dQ/riRGcGVlO0B3jeODb9cz7C+Dh8HT7VwHCbiP/AHoDz+311gWfnUJw3c+fCX47CwkuciWs5/MI/iFYT3B2aTxBg0LY9ewWgjpaQNB/vT77Wie1XQycazuPvnNc2Af7bILks5jgqNx/oX2eSEujdqxuB9qOQdBNbR1BG/M4cJ27zw2n3UBwRmUxwT787wQHmyAYBGI2sNLM1uznuneyF/vsS4CjCJKWWwkOKta0ORsI2sa/EZyd2gLUdwPXRwi68C0j+E1N2sdQxxJci7Uuoc25xINrVk8maJOWE9RtzSBC9bkUWBL+9q4jvEZMGl/NCFUizerGZWF3ptUEo58siDqevWXBzfq+4e7/jjoWADP7OTDAG/lGeGb2IfBXd3+wMdcjIrI7asdSg5k9RXDWf197C4joKKg0W98CJqvB2H9hN7KrgfsaYdnHm1nXsNvb5cAw9v2sjYhIMlM71kDCLmR9w66PYwjOEu1y3ySRvdFs76orqSs8e2IE92CQ/WBm3yToUvaou9fXZ/xADCToE55NcF+JC8L++SIiKU/tWIPrStB9uxNBl7Zvufv0aEOSlkrd3kREREREJCWo25uIiIiIiKQEJT8iIiIiIpISmvU1P7m5uV5YWBh1GCIiKW/q1Klr3L2+GwemNLVVIiLR29t2qlknP4WFhUyZMiXqMEREUp6ZfR51DM2V2ioRkejtbTulbm8iIiIiIpISlPyIiIiIiEhKUPIjIiIiIiIpQcmPiIi0eGY2zsxWm9msOqb90MzczHLD92Zmd5nZQjP7xMwOT5j3cjNbED4ub8ptEBGRxqfkR0REksFDwJjahWbWAzgJ+CKh+FSgf/i4BrgnnLcjcDMwChgJ3GxmHRo1ahERaVJKfkREpMVz94lASR2T7gT+G/CEsrOBRzwwCWhvZt2AU4AJ7l7i7uuACdSRUImISMuVtMnP0pJS/t9THzOzaEPUoYiISATM7CxgmbvPqDUpH1ia
8L4oLKuvvFFsKK3gv56ZwdvzixtrFSIiUkvSJj/p8RjPT1/G1M/rOhAoIiLJzMzaAP8D/LyuyXWU+W7K61r+NWY2xcymFBfvX/LSOiPO89OX8eHitfv1eRER2XdJm/zk5WTSoU06c1ZsijoUERFpen2B3sAMM1sCFADTzKwrwRmdHgnzFgDLd1O+C3e/z92Hu/vwzp33eEPxOmWkxeidm8X8VZv36/MiIrLvkjb5MTMGd89hzsqNUYciIiJNzN1nunsXdy9090KCxOZwd18JvAhcFo76diSwwd1XAK8BJ5tZh3Cgg5PDskYzIK8tC1brIJ2ISFNJ2uQHYFDXHOau3ERlVXXUoYiISCMysyeAD4CBZlZkZlfvZvaXgcXAQuB+4NsA7l4C/AqYHD5uCcsaTb8u2XxRUsrW8qrGXI2IiITSog6gMQ3unkN5ZTWfrdlC/7y2UYcjIiKNxN3H7mF6YcJrB66vZ75xwLgGDW43BuS1xR0WFW9maH67plqtiEjKSu4zP91yAPh0hbq+iYhI8zMgLxtAXd9ERJpIUic/fTtnkxGPKfkREZFmqTA3i/S4adADEZEmktTJT0ZajH5dsjXim4iINEvp8WDEtwWr1E6JiDSFpE5+IOj69ulynfkREZHmqX+XtjrzIyLSRJI++RncPYc1m8tYvWlb1KGIiIjson9eNkvXacQ3EZGmkPTJz6BuwShv6vomIiLNUeKIbyIi0riSPvkZHI74NkeDHoiISDNUM+LbfF33IyLS6JI++WnfJoPu7Vrpuh8REWmWenXSiG8iIk0l6ZMfCAY90JkfERFpjjTim4hI00mJ5Gdw9xwWr9nCtgpdTCoiIs1P/7y2LFitMz8iIo0tJZKfQd1yqKp29acWEZFmaUCXthrxTUSkCaRE8qNBD0REpDkbkJeNOyzU2R8RkUaVEslPz45tyMqIa9ADERFplvprxDcRkSaREslPLGYc1C1H9/oREZFmqWbEN133IyLSuFIi+YHgZqdzVmzE3aMORUREZCfp8Rh9crM14puISCNLoeQnh01llRSt2xp1KCIiIrvon5fN/NVKfkREGlPKJD81gx58qkEPRESkGerfpS1LS7ZSWl4ZdSgiIkkrZZKfgV3bYoYGPRARkWZpQDjogUZ8ExFpPCmT/LTJSKN3bpaGuxYRkWapf15bABasUvIjItJYUib5geC6nzkrlfyIiEjzU9ipDRnxmK77ERFpRCmV/AzulsPSkq1s3FYRdSgiIiI7SYvH6NM5S2d+REQaUcolPwBzdb8fERFphvp1ydaNTkVEGlFKJT+DakZ8W74h4khERER2NSCvLUXrNOKbiEhjSankJy8nk45ZGczRmR8REWmGNOKbiEjj2mPyY2bjzGy1mc1KKPuVmX1iZh+b2etm1j0sNzO7y8wWhtMPT/jM5Wa2IHxc3jibs8dtYVC3thr0QEREmqWaEd/m67ofEZFGsTdnfh4CxtQq+627D3P3Q4HxwM/D8lOB/uHjGuAeADPrCNwMjAJGAjebWYcDjn4/DO6Ww9yVm6isqo5i9SIiIvXq1TEY8W2BrvsREWkUe0x+3H0iUFKrLPHUSRbg4euzgUc8MAlob2bdgFOACe5e4u7rgAnsmlA1iUHdciivrOazNVuiWL2IiEi9akZ806AHIiKNY7+v+TGz28xsKXAJO8785ANLE2YrCsvqK29y2wc90M1ORUSSRjJ10e6f15YFuuZHRKRR7Hfy4+7/4+49gMeB74TFVtesuynfhZldY2ZTzGxKcXHx/oZXr76ds8mIx5T8iIgkl4dIki7aA7pkU7RuK1vKNOKbiEhDa4jR3v4OnB++LgJ6JEwrAJbvpnwX7n6fuw939+GdO3dugPB2lpEWo1+XbI34JiKSRJKpi3bNoAca8U1EpOHtV/JjZv0T3p4FzA1fvwhcFnYpOBLY4O4rgNeAk82sQ3gU7eSwLBKDu+fw6fKNuNd58klERJJEY3XRbsxeCv3D4a513Y+I
SMPbm6GunwA+AAaaWZGZXQ3cYWazzOwTgkTme+HsLwOLgYXA/cC3Ady9BPgVMDl83BKWRWJEYQfWbC7jvHveZ/KSyMIQEZFG1lhdtBuzl0LNiG868yMi0vDS9jSDu4+to/iBeuZ14Pp6po0Dxu1TdI3kwiN6YBi/mzCPC//6AScPzuNHpx5E387ZUYcmIiKN4+/ASwTX9Oyui/boWuVvNU14O2jENxGRxtMQ1/y0OLGY8dURPXjzh6P54ckDeH/RWk6+cyI//edMijeVRR2eiIg0gJbcRXtAXlvd6FREpBHs8cxPMmuTkcZ3vtyfi0f25K43FvD3D7/g+WnL+NbovnzjuD60So9HHaKIiOyFsIv2aCDXzIoIzvCcZmYDgWrgc+C6cPaXgdMIumiXAldC0EXbzGq6aEOEXbT7d8nmxRnL2VJWSVZmSjfVIiINSntUIDc7k1vOHsoVRxfy61fn8n+vz+eZqUX84qwhnDCwS9ThiYjIHiRbF+0h+cE96SYvKWG02iERkQaTkt3e6tOnczb3Xjqcx64eRTxmXPngZL75yBSWlpRGHZqIiKSQY/rlktMqjX9OXxZ1KCIiSUXJTx2O7Z/Lq9/7EjedehDvLVzDib9/m7veWMC2iqqoQxMRkRSQmRbnjEO68+rslWzWzU5FRBqMkp96ZKTFuO74vrxx4/GcODiP30+Yzyl/mMgHi9ZGHZqIiKSA8w7LZ1tFNa/OWhl1KCIiSUPJzx50a9eau792OI9dPYqYGZf8bRJ3vbGAqmrdIFVERBrPEb060LNjG56fXhR1KCIiSUPJz146tn8u4284lrMO6c7vJ8znigc/Ys1mDYstIiKNw8w497B83l+0lhUbtkYdjohIUlDysw+yMtO486JDuf28g/nwsxJOv+sdPlysbnAiItI4zj0sH3f45/TlUYciIpIUlPzsIzNj7Mie/PPbx9AmI42x90/i7jcXUq1ucCIi0sAKc7M4olcHnptWRDBCt4iIHAglP/tpcPccXvzOMZx2cDd++9o8rnp4MhtKK6IOS0TQeygzAAAgAElEQVREksy5h+WzYPVmZi/fGHUoIiItnpKfA9C2VTp/GnsYt54zlPcWruHCe99n+Xr1yxYRkYZzxrBuZMRjPDdN9/wRETlQSn4OkJnx9SN78fCVI1mxfhvn/eV95q3cFHVYIiKSJNq3yeDLB3XhxRnLqKyqjjocEZEWTclPAzm6Xy5PXXsU1e5c8Nf3maSBEEREpIGce3g+azaX886CNVGHIiLSoin5aUCDu+fw3LePJi+nFZc98BHjP9HoPCIicuBOGNiF9m3SeW66ur6JiBwIJT8NrKBDG5697iiGFbTjhiemM+7dz6IOSUREWriMtBhnDuvO67NXsnGbBtcREdlfSn4aQfs2GTz2jVGcPDiPW8Z/yu0vz9EQpSIickDOPTyfsspqXp25MupQRERaLCU/jaRVepy/XHIEXz+yJ/dOXMwt4z9VAiQiIvvtsB7t6Z2bxXPTi6IORUSkxUqLOoBkFo8Zvzp7KGmxGA++t4SYGT89fRBmFnVoIiLSwpgZ5xyaz53/nk/RulIKOrSJOiQRkRZHZ34amZlx85mDueLoQh549zP+V13gRERkP517WD4AL3ysAXVERPaHkp8mUJMAXXZUL+5/5zNuf2WuEiAREdlnPTu1YURhB56bVqR2RERkPyj5aSJmxi/PGsKlR/bivomLueNVJUAiIrLvzjo0n0XFW5irG2qLiOwzJT9NyMy45ewhwSAIby/mN6/NUwIkIiL75NShXYkZupeciMh+UPLTxMyMW84aytdG9eSetxZx54T5UYckIiItSG52Jkf17cRLn6zQATQRkX2k5CcCsZhx69lDuWh4D+76z0Ieek83QhURkb13xrDuLFlbyuzlG6MORUSkRVHyE5FYzLjt3KGcPDiPX47/lBdnqPuCiIjsnVOGdCUeM8Z/siLqUEREWhQlPxFKi8e4a+xhjCjsyI1Pf8w7
C4qjDklERFqAjlkZHNMvl/GfLFfXNxGRfaDkJ2Kt0uPcf9lw+nbO5tpHp/JJ0fqoQxIRkRbgjIO7UbRuK58UbYg6FBGRFkPJTzPQrnU6j1w1ko5ZGVzx4GQWF2+OOiQREWnmThnSlfS48dJMdX0TEdlbSn6aiS45rXj06lEYcOkDH7Fq47aoQxIRaRHMbJyZrTazWQllvzWzuWb2iZk9b2btE6b92MwWmtk8MzsloXxMWLbQzG5q6u3YV+3apHNsv1yN+iYisg+U/DQjvXOzePDKEawvLefycR+xYWtF1CGJiLQEDwFjapVNAIa6+zBgPvBjADMbDFwMDAk/8xczi5tZHLgbOBUYDIwN523WzhjWnWXrtzJ9qbpMi4jsDSU/zcywgvbce+lwFhVv5rpHp1JeWR11SCIizZq7TwRKapW97u6V4dtJQEH4+mzgSXcvc/fPgIXAyPCx0N0Xu3s58GQ4b7N20pA8MuIxxs9Q1zcRkb2h5KcZOrZ/Lr8+fxgfLF7LTf/4RN0ZREQOzFXAK+HrfGBpwrSisKy+8mYtp1U6XxrQmZdnrqC6Wm2FiMieKPlpps47vIAfnDSA56Yv485/L4g6HBGRFsnM/geoBB6vKapjNt9NeX3LvcbMppjZlOLiaG9TcOYh3Vi5cRtTv1gXaRwiIi3BHpOfVL2QtDm44cv9+OrwAu56YwFPT1m65w+IiMh2ZnY5cAZwie84hV4E9EiYrQBYvpvyOrn7fe4+3N2Hd+7cuWED30dfGZRHZlqMl3TDUxGRPdqbMz8PkaIXkkbNzLjt3IM5rn8uP3luJu8uWBN1SCIiLYKZjQF+BJzl7qUJk14ELjazTDPrDfQHPgImA/3NrLeZZRC0ZS82ddz7IzszjRMGduGlmSuoUtc3EZHd2mPyk8oXkjYH6fEYf7nkcPp1yeZbj01l7sqNUYckItKsmNkTwAfAQDMrMrOrgT8DbYEJZvaxmf0VwN1nA08DnwKvAte7e1XYpn0HeA2YAzwdztsinD6sG8Wbypi8pGTPM4uIpLCGuOYnaS8kbS7atkrnwStH0CYzzlUPTtY9gEREErj7WHfv5u7p7l7g7g+4ez937+Huh4aP6xLmv83d+7r7QHd/JaH8ZXcfEE67LZqt2T9fGdSFVukxxn9Sb089ERHhAJOfxriQtDldRNqcdGvXmnFXjGDD1gqufHAym8sq9/whERFJCW0y0vjKQXm8OmsllVW6RYKISH32O/lprAtJm9NFpM3NkO7tuPuSw5m3ahPf+fs0NXAiIrLdGcO6sWZzOR9+pq5vIiL12a/kJ5UuJG1uRg/swq3nDOWtecX8/MXZugeQiIgAQfvQJiPOv2ao65uISH32ZqjrlL+QtLkZO7In3x7dl79/+AV/fXtx1OGIiEgz0DojzpihXXnpkxVsLa+KOhwRkWYpbU8zuPvYOoof2M38twG7XCjq7i8DL+9TdFKvH548kKXrtvLrV+dS0KE1Zx7SPeqQREQkYhccUcBz05bx+qcrOftQjSskIlJbQ4z2JhGIxYz/u3AYIws7cuPTMzS8qYiIcGTvThR0aM2zU4uiDkVEpFlS8tOCZabFuffSIyjo0JpvPjKFxcWbow5JREQiFIsZ5x9ewLsL17Bs/daowxERaXaU/LRwHbIyeOjKkcTNuOLByazdXBZ1SCIiEqELjijAHZ6fprM/IiK1KflJAj07teH+y4ezauM2rn54ii50FRFJYT06tuHIPh15dmqRRgQVEalFyU+SOLxnB/548WHMKFrPDU/oHkAiIqnsgiN6sGRtKVM+Xxd1KCIizYqSnyQyZmhXbjlrCP+es5qfvTBLR/xERFLUqUO70iYjzrNT1PVNRCSRkp8kc+lRhVx/Ql+e+Ggpf3xjQdThiIhIBLIy0zj94G68NHMFpeWVUYcjItJsKPlJQj88eSDnH17AH/69gCc++iLqcEREJAIXHFHA5rJKXpu9MupQRESaDSU/ScjMuOP8gzl+QGf+
5/mZvDFnVdQhiYhIExtR2JGeHdvwjLq+iYhsp+QnSaXHY/zlksMZmt+O6/8+jWlf6KJXEZFUUnPPn/cXraVoXWnU4YiINAtKfpJYVmYa464YQV5OK65+aDKLdBNUEZGUcv4R+QA8N21ZxJGIiDQPSn6SXG52Jg9fOZKYGZc98BHLdcdvEZGUUdChDUf37cSzU4uortYIoCIiSn5SQGFuFg9fNZKNWyu49IEPWbu5LOqQRESkiVxwRAFflJQyeUlJ1KGIiEROyU+KGJrfjr9dPpyidVu54sHJbNpWEXVIIiLSBMYM7Up2ZhrPTtXAByIiSn5SyKg+nfjr149gzoqNXP3wFLZVVEUdkoiINLI2GTvu+bOlTPf8EZHUpuQnxZxwUBd+f9GhTF5SwvWPT6OiqjrqkEREpJFdMLyA0vIqxn+yPOpQREQipeQnBZ11SHduPWcob8xdzQ+fmaGLYEVEktzwXh04qGtbHnxvCe7a54tI6lLyk6IuGdWL/x4zkBc+Xs7NL85WYygiksTMjCuOLmTuyk1MWqyBD0QkdSn5SWHfOr4v136pD49O+pw7XpmrBEhEJImdc1g+Hdqk89D7n0UdiohIZNKiDkCiY2bcdOpBlJZXce/ExaTFjR+ePBAzizo0ERFpYK3S44wd2ZO/vr2IpSWl9OjYJuqQRESanM78pDgz45dnDWHsyJ7c/eYi/vDvBVGHJCIijeTSo3phZjzywZKoQxERiYSSHyEWM247ZyhfHV7AH99YwJ/eUAIkIpKMurVrzZihXXly8lINey0iKUnJjwBBAnT7ecM477B8fjdhPn95a2HUIYmI7BUzG2dmq81sVkLZhWY228yqzWx4rfl/bGYLzWyemZ2SUD4mLFtoZjc15TY0pauOKWTTtkqem6abnopI6lHyI9vFY8ZvLzyEsw/tzm9encf9ExdHHZKIyN54CBhTq2wWcB4wMbHQzAYDFwNDws/8xcziZhYH7gZOBQYDY8N5k87hPTswrKAdD72/RLc6EJGUo+RHdhKPGb+78BBOH9aN216ew7h3NSqQiDRv7j4RKKlVNsfd59Ux+9nAk+5e5u6fAQuBkeFjobsvdvdy4Mlw3qRjZlx5TCGLirfwzsI1UYcjItKklPzILtLiMf5w0aGMGdKVW8Z/yr1vL4o6JBGRhpIPLE14XxSW1VeelE47uBu52Zk8+J4OcIlIalHyI3VKj8f409cO44xh3bj9lbncOWG+7gMkIsmgrrH8fTfldS/E7Bozm2JmU4qLixssuKaSmRbn60f25K15xSwq3hx1OCIiTUbJj9QrPR7jjxcfxgVHBKPA3a4boYpIy1cE9Eh4XwAs3015ndz9Pncf7u7DO3fu3CiBNrZLRvUiIx7jkfeXRB2KiEiTUfIjuxWPGb85fxiXHdWL+yYu5mcvzNIFsiLSkr0IXGxmmWbWG+gPfARMBvqbWW8zyyAYFOHFCONsdJ3bZnLGId14dmoRG7dVRB2OiEiTUPIjexSLBTdCvfb4Pjw26Qt++OwMKquqow5LRAQAM3sC+AAYaGZFZna1mZ1rZkXAUcBLZvYagLvPBp4GPgVeBa539yp3rwS+A7wGzAGeDudNalce3Zst5VU8PXnpnmcWEUkCaVEHIC2DmXHTmIPIykjj9xPmU1ZRzZ0XHUpGmvJnEYmWu4+tZ9Lz9cx/G3BbHeUvAy83YGjN3sEF7RjeqwMPf7CEK44uJC2ufbqIJDft5WSvmRnf/Up/fnr6IF6auYJrHp1CabnuEC4i0pJ980t9WFqylaen6KanIpL8lPzIPvvGcX24/byDmTi/mLH3TWLN5rKoQxIRkf108uA8RhR24PcT5rFJ1/6ISJJT8iP7ZezIntx76XDmrdrE+fe8z5I1W6IOSURE9oOZ8dPTB7Nmczn3vKX7uolIcttj8mNm48xstZnNSii70Mxmm1m1mQ2vNf+PzWyhmc0zs1MSyseEZQvN7KaG3QyJwkmD83j8G0eycWsF59/z
PjOWro86JBER2Q+H9GjPuYfl87d3P2NpSWnU4YiINJq9OfPzEDCmVtks4DxgYmKhmQ0mGB50SPiZv5hZ3MziwN3AqcBgYGw4r7RwR/TqwD++dTRtMuNcfN8k/jN3VdQhiYjIfvivUwYSM/jNa/OiDkVEpNHsMflx94lASa2yOe5e197xbOBJdy9z98+AhcDI8LHQ3Re7eznwZDivJIE+nbP5x7eOpm+XLL75yFSemvxF1CGJiMg+6t6+Ndcc14d/zVjO1M/XRR2OiEijaOhrfvKBxJsFFIVl9ZVLkujSthVPXnMUx/TL5Uf/mMlvX5urm6GKiLQw1x7fly5tM7n1pU9x1z5cRJJPQyc/VkeZ76Z81wWYXWNmU8xsSnFxcYMGJ40rOzONBy4fzsUjenD3m4u49rGpbC7TUNgiIi1FVmYaPzxlINO/WM/4T1ZEHY6ISINr6OSnCOiR8L4AWL6b8l24+33uPtzdh3fu3LmBw5PGlh6Pcft5B/OLMwfzn7mrOe8v7/H5Wo0EJyLSUpx/eAGDu+Vwxytz2VZRFXU4IiINqqGTnxeBi80s08x6A/2Bj4DJQH8z621mGQSDIrzYwOuWZsLMuOKY3jxy1UhWbyrj7Lvf472Fa6IOS0RE9kI8Zvz09EEsW7+Vce99FnU4IiINam+Gun4C+AAYaGZFZna1mZ1rZkXAUcBLZvYagLvPBp4GPgVeBa539yp3rwS+A7wGzAGeDueVJHZMv1xeuP4YurTN5LJxH/HQe5+pD7mISAtwdL9cThyUx1/eXETxJt3IWkSShzXnf0aHDx/uU6ZMiToMOUCbyyr5/pMf8+85q7h4RA9+cdYQWqXHow5LRPaBmU119+F7njP1JGtbtbh4MyffOZHzDy/g1xcMizocEZHd2tt2qqG7vYnsIjszjfsuPYIbvtyPJycv5fx73mfJGl0HJCLSnPXpnM3Vx/XmqSlLeXmmBj8QkeSg5EeaRCxm3HjyQP522XCK1m3ljD+9y79m1DnmhYiINBM3njSQQ3u050fPfqKDViKSFJT8SJM6cXAeL3/vOAbkZXPDE9P5yfMzNZqQiEgzlZEW489fO4xYzPj249O0vxaRFk/JjzS5/Pateerao7j2+D78/cMvOOfu91hUvDnqsEREpA4FHdrw+68ewqcrNvKr8Z9GHY6IyAFR8iORSI/H+PGpg3jwihGs2riNM//0Ls9NK9JocCIizdBXBuVx7fF9ePzDL3jh42VRhyMist+U/EikTjioCy9/7ziGdM/hB0/P4NuPT2PNZg2rKiLS3Pzw5IGMKOzAj5+bycLVOlsvIi2Tkh+JXLd2rXnim0fyozEH8cac1Zx850SNLCQi0sykx2P8aezhtEqPc/3j09harut/RKTlUfIjzUJaPMa3Rvdl/HePJb99a779+DRueGI667aURx2aiIiEurZrxR8uOpT5qzfx8xdmRR2OiMg+U/IjzcqAvLY89+2jufGkAbw6awUn3TmR12evjDosEREJfWlAZ244oR/PTC3isUmfRx2OiMg+UfIjzU56PMYNX+nPC9cfS+e2mVzz6FRueGI6qzZuizo0EREBvnfiAE4Y2JmfvTCLZ6cWRR2OiMheU/Ijzdbg7jm8cP0xfP/E/rw2eyVf/r+3+Ns7i6moqo46NBGRlBaPGfd8/QiO6ZvLfz07QyPAiUiLoeRHmrWMtBjfP3EAr3//S4zs3ZFbX5rD6Xe9wweL1kYdmohISmuVHuf+y4YzqndH/t9TH/PSJxqoRkSaPyU/0iIU5mYx7ooR3H/ZcErLqxh7/yS+96S6womIRKl1RpwHLh/BEb068N0np/PqLF2jKSLNm5IfaTHMjJMG5/HvHxzPd7/Sn1dmBV3h/vyfBZSWV0YdnohISsrKTOPBK0cyrKAdNzwxjTfmrIo6JBGRein5kRanVXqcH5wUdIU7qm8u//f6fL70m7d49IMluh5IRCQC2ZlpPHzVSAZ1y+Fbj03jrXmrow5JRKRO
Sn6kxSrMzeJvlw/nH986ij65Wfzshdmc+Pu3eeHjZVRXe9ThiUgTMrNxZrbazGYllHU0swlmtiB87hCWm5ndZWYLzewTMzs84TOXh/MvMLPLo9iWliqnVTqPXjWKfl2yuebRqfxrxvKoQxIR2YWSH2nxjujVkaeuPZIHrxhB6/Q433vyY07/07u8OXc17kqCRFLEQ8CYWmU3AW+4e3/gjfA9wKlA//BxDXAPBMkScDMwChgJ3FyTMMneadcmnce/MYqD89txwxPTuf3lOVTpYJSINCNKfiQpmBknHNSFl797HH+46FA2l1Vw5UOTOevP7/HKzBU6EySS5Nx9IlBSq/hs4OHw9cPAOQnlj3hgEtDezLoBpwAT3L3E3dcBE9g1oZI96JCVwRPfPJKvH9mTeycu5ooHP2LdlvKowxIRAZT8SJKJxYxzDsvnjR+M5vbzDmbjtgq+9fg0TrzzbZ6espTySl0TJJJC8tx9BUD43CUszweWJsxXFJbVVy77KCMtxq3nHMyvzz+YDxeXcNbd7/Lp8o1RhyUiouRHklNGWoyxI3vynxtH86exh5GZFue/n/2E0b99kwff+4yt5VVRhygi0bE6ynw35bsuwOwaM5tiZlOKi4sbNLhkctGInjx17ZGUV1Zz3j3v8aKuAxKRiCn5kaQWjxlnHtKdl797LA9eMYL8Dq355b8+5ag73uD2V+awtKQ06hBFpPGsCruzET7XDEFWBPRImK8AWL6b8l24+33uPtzdh3fu3LnBA08mh/XswL9uOJaD89vx3Semc+v4T9lWoQNQIhINJT+SEmquCXrmuqN55rqjOKpPJ/72zmcc/9s3+eYjU3hv4RoNjiCSfF4EakZsuxx4IaH8snDUtyOBDWG3uNeAk82sQzjQwclhmRygLm1b8fg3juSyo3rxt3c/47Q/vsOHi9dGHZaIpKC0qAMQaWojCjsyorAjy9dv5bFJn/Pk5KVM+HQV/btkc9nRhZx7WD7ZmfrTEGlJzOwJYDSQa2ZFBKO23QE8bWZXA18AF4azvwycBiwESoErAdy9xMx+BUwO57vF3WsPoiD7KSMtxi1nD+WkwXn85PmZXHTfJL42qic3nXoQOa3Sow5PRFKENeej3cOHD/cpU6ZEHYYkuW0VVYz/ZAUPv7+Emcs20CYjzukHd+OrI3owvFcHzOq6DEAktZjZVHcfHnUczZHaqn1XWl7JnRPm88C7n5GbncktZw9lzNCuUYclIi3Y3rZTSn5EQu7O9KXreXryUv41Yzlbyqvok5vFhcN7cP7h+XTJaRV1iCKRUfJTP7VV+++TovX86B8zmbNiI2OGdOWXZw8hT/taEdkPSn5EDsCWskpenrmCp6csZfKSdcRjxugBnTn38Hy+clAerTPiUYco0qSU/NRPbdWBqaiq5v53FvOHfy8gbsZVxxZyzZf60q61usKJyN5T8iPSQBYXb+aZqUX8Y2oRqzeVkZUR56TBeZx9aD7H9s8lPa5xQyT5Kfmpn9qqhvH52i387vX5vDhjOe1ap/Pt0X25/OhCWqXrYJOI7JmSH5EGVlXtfPjZWl78eDmvzFrJhq0VdGiTzqkHd+OsQ7ozorAj8ZiuD5LkpOSnfmqrGtasZRv47WvzeHt+MV1zWvH9E/tzwREFpOlAk4jshpIfkUZUXlnNxPnFvDhjORM+XcXWiio6ZWVw0uA8ThnSlaP7dSIzTUcrJXko+amf2qrG8cGitfz61bl8vHQ9fTpn8Z0T+nHGsO5kpCkJEpFdKfkRaSKl5ZX8Z+5qXpu9ijfnrmZzWSXZmWmccFAXxgzpyuiBncnS0NnSwin5qZ/aqsbj7rz+6Sp+//p85q3aRNecVlx5TCFjR/XU8NgishMlPyIRKKus4v2Fa3l11komzFlFyZZyMuIxRvXpyAkDu/Dlg7pQmJsVdZgi+0zJT/3UVjU+d+et+cXcP3Ex7y9aS3ZmGheP6MGVx/Ymv33rqMMTkWZAyY9IxCqrqpny+Tre
mLOK/8xdzaLiLQD0yc1idJgIjejdQd3jpEVQ8lM/tVVNa9ayDdz/zmLGf7ICgNMP7sYlo3oysndH3ZdNJIUp+RFpZr5YW8p/5q7iP/OKmbR4LeWV1bROjzOyd0eO65/Lsf1zGZjXVo23NEtKfuqntioay9Zv5cF3P+OpyUvZVFZJn9wsLhrRg/OPKCA3OzPq8ESkiSn5EWnGSssreX/hWt5ZUMw7C9ewODwr1LltJsf2y+XYfrkc1bcT3dWdQ5oJJT/1U1sVrdLySl76ZAVPTV7KlM/XkRYzThqcx0UjenBc/84ahVMkRTRY8mNm44AzgNXuPjQs6wg8BRQCS4Cvuvs6Cw5Z/xE4DSgFrnD3aeFnLgd+Gi72Vnd/eE/BqUGRVLF8/VbeXbCGdxau4b2FayjZUg5Aj46tObJ3J47s04kj+3ZS33aJjJKf+qmtaj4WrNrEU5OX8o9pRawrraBbu1acMawbZx7SnYPz2+nMukgSa8jk50vAZuCRhOTnN0CJu99hZjcBHdz9R2Z2GnADQfIzCviju48Kk6UpwHDAganAEe6+bnfrVoMiqai62pmzciMfLi5h0uK1fPhZCRu2VgBQ0KE1o3p3YkRhB4YXdqBv52w15tIklPzUT21V81NWWcXrs1fxwsfLeHt+MRVVTq9ObThzWHfOPKQ7A7u2jTpEEWlgDdrtzcwKgfEJyc88YLS7rzCzbsBb7j7QzO4NXz+ROF/Nw92vDct3mq8+alBEgmRo3qpNTFq8lg8WrWXykhLWlQbJUPs26RzRswPDCzsyvLADB+e3093QpVEo+amf2qrmbX1pOa/NXsm/Zqzg/UVrqHYYkJfNqUO7ceKgPIbm5+ggkkgS2Nt2an9vPpLn7isAwgSoS1ieDyxNmK8oLKuvXET2IBYzBnXLYVC3HK48pjfuzuI1W5i6ZB2Tl5Qw9fN1vDF3NQBp4byH9mgfPHq2p3enLGLq8y4iKap9mwwuGtGTi0b0pHhTGa/MWsG/Ziznrv8s4I9vLCAvJ5OvDMrjxEFdOLpvrg4giSS5hr7zYl3/YfluynddgNk1wDUAPXv2bLjIRJKEmdG3czZ9O2fz1RE9AFi7uYypn69j+tL1fPzFep6bVsSjkz4HIKdVGof0aM8hBe0Zmt+OYQXt6NaulY50ikjK6dw2k8uOKuSyowpZu7mMN+cV88acVbwwfRl///ALWqfHOaZfLl8akMvRfXPp2zlL+0qRJLO/yc8qM+uW0O1tdVheBPRImK8AWB6Wj65V/lZdC3b3+4D7IOhKsJ/xiaSUTtmZnDykKycP6QpAVbWzcPVmPl66jo+Xrmf6F+u55+1FVFUHf1KdsjI4uKAdB+e3Y2h+O4Z0zyG/fWs18vL/27uzGMmu+o7j33/ta+/bbN3j2byhYGPHcXCCCIkQEBSTCJKgRHEQCg8BiUiJEpMXFCKQeUnIQxQJAgoRCWAFiK2AktgEAiHgsNiA7fYyM56tu2d67659PXm4t6uq3T02HnqqZur+PlLp3HvuqZ7Tf3XVmf+9554rEhijmThvv+Mgb7/jIJV6g8dOr/Ko/1y2R2cvATA5EOe1R73VN+85NqZFZ0T6wJUmPw8D9wEP+OVDHfXvM7PP4S14sOEnSP8BfMTMhv12bwQ+cOXdFpGXEg4ZN05luXEqy2/9rHcFtVxrMLuwyY/nNvjRhQ2enNvgG88t4edDDCaj3LJvgFv2D3Drfq88Op4hGg718DcREbn64pEwrzsxzutOjPOhe73nsn3rlLf65jeeW+JLj88BMD2S4o6ZYV4zM8xrpoe4cTJLRN+RIteVl01+zOyzeFdtxszsAvBBvKTnQTN7N3AOeIff/Ct4K72dxFvq+l0AzrlVM/tL4Lt+uw8551b38PcQkZeRiIa5fXqY26eHW3WlaoOnFzaZXdjk6YVNnprf5DPfOUul3gQgFg5xdCLDTVNZbvKTqZumBpgciOsqkYj0renRFNOj0yvWS8IAAA9HSURBVLzz
rmmaTcdzizm+dXKFx06v8M3nl1vJUCoW5tUHh3jNzBC3HRrm1v0DmlYsco3TQ05FZJt6o8mZlQJPzW8yu5DjmYubPLOQ4+JmudVmKBXlxGSWE5MZTkxmOT7hbY/qqep9S6u9XZ7GqmBxznF+tcQPzq21XrMLuda04pF0rHX1/FX7vWnFM6NpPWxV5Cq72qu9iUifioRDHJvIcmwiy723tevXi1WeuZjj2YteQvTsxRwPPT5PrlJvtRlNxzg+meHYRIZj4xmOTnjbUwM6Eyoi/cHM/CtDKd52u7dwbbFaZ9a/ev7U3CZPLWzwqf95gVrDS4gS0RDHJjKcmMhyfDLLjVMZjk9kOTCU1GqcIl2m5EdEfiJDqRh3Hxnl7iOjrTrnHBc3yzx/Kc9zl3JeuZjjoSfmyZXbSVE6FubohLdC3ZGxNEfGM9wwluaGsTTJmJaVFZHrWyoW4Y6ZEe6YGWnVVetNnl/M8dT8Js9dzPHcYp7/PbXCF/0pc977wtzQ8Z14dDzNkbEMN4ynycT1XzSRq0GfLBG5YmbGvsEk+waTvO7EeKveOcdSvsLJxTynlgqcWsxzainPY6dXWnPlt+wfTHBkPMPhsRSHR72E6PBYmkPDKWIR3UgsItenWCTErfsHuXX/4Lb6jVKNk4s5nr2Y5/nFHKeXCvzw/Dpf/tF8awEa8JblnhlJMT3iXWWaGU0xPZJmZjTFaDqmq+kiV0jJj4jsOTNjIptgIpvgtUfHth0rVuucWS5yejnP6aUCLywXOL2U33G1KGRwYDjJ4dE0h0fT/sCfYmY0zfRISleMROS6NJiM7rhKBN6KnOdWi5xeKnB6Oc+Z5QJnV4p8+/T2q0UAyWiYg8NJDgwnOTCU5OBwqrV9YCjJeDaue4xELkPJj4h0VSoW4Rb/ZuBOzjnWijVeWC5wZrnAmRUvMTqzUuCJ8+vbEiOAiWycmdEUh7bOjHa8xrNajU5Eri+JaNhfSCa741i51uDCWpFzq0XOrhQ5v1pibr3IhbUST5xfZ71Y29Y+HDIms3H2DSWZGkywfzDB1GCSqYEEkwNxJgcSjGfjJKI6iSTBo+RHRK4JZsZIOsZIOsYdM8PbjjnnWC/WOLta5OxKgXMrRc6uFjm3UuTbp7ypdJ0LV8YjIQ6NpDg4nPRfqW2lpoyIyPUkEQ23FqLZTb5SZ26txIW1IgsbZRY2Sl65Xubp+U0effpS6xEGnYZSUSazCSYG4oxn4oxlvXI8G2fML0czMYZTMV1Jkr6h5EdErnlmxnA6xnA6xm2HhnYcr9QbzK2VOLda5PxqkfNrJc6tFJlb3/2saCIaYv/Q1nQRf6rIcJL9g0n2+2dK9XBXEbleZOKR1oOtd7N1AulSrsylzQqXNsssbnZs5yq8sFxgKVfZNUkyg+GUd3JqNB1jNBNjNB33vpdTUUbSXoI0nIoxnI4ynIqRioV1kkmuSUp+ROS6F4+EOTKe4ch4ZtfjuXKNufUSF1ZLnF8rMrdWYm7de80ubLKcr25rb+ZNq9s/tJUQJfyFHRLsG/LKsYzm1IvI9aHzBNJNU5dv55wjV6mznKuwlKuwlK+wkq+yUqiykq+wWqiykq/y7MUcK4UVNko1Lve4yGjYGEzGGEpFGUpGGUpFGUzGGExGGUhGGEhEGUhGGUhE/Loo2USEbCJKJh7R96tcNUp+RKTvZRNRbpqKctPUwK7Hy7WGlwytlVjYKDG3XmZhvcT8hpccPTq7c8rItjn1AwkmBxJMDXpz6acGEkwNenWaU99bZvZ+4A8AAz7hnPuYmY0AnwcOA2eA33TOrZl3mvpvgLcAReD3nXM/6EnHRXrAzLykJBG97MmkTo2mY6NUY7VQZb1Y9csaq8UqG6Ua68UaGyWvbn69zOxCjs1Sbdvz4S4nHQuTTWwlRBEyiSjZeIRMPEIm4Zf+dioWJhOPkPbr0vEI6ViYdDxCMhrWs5RkGyU/IhJ4iWiYo+Pec4h245xjtVDl4maZixtlFjba5cJG
idmLm3z92UUK1caO9w4mo0xkvaRoYiDORNa74XiiY579xECcVExfx3vNzF6Fl/jcBVSBfzezL/t1X3XOPWBm9wP3A38GvBk47r9+Dvg7vxSRXYRD7Xs1X4l6o0m+UmezVGezXGOzVGOjVCNX9vZz5br/8rcrNTaKVebWiuQrdfLl+q7ft5eTioVJxSKk417p7YdJRv2ysy4WJhX1ykTUa5+MhknGQiSi3nu2ymQsTDwS0vS+64xGWxGRl2FmjGbijGbiO57Z0SlXrnFps50cbc2l3yofO11gMVduPfW9UzoWZmIg0XGzcax10/HWjcdjfn08oqtJP6Gbge8454oAZvbfwK8D9wKv99t8Gvg6XvJzL/CPzjkHfMfMhsxsn3NuodsdF+lnkXCIoVSModQrS5o6NZuOQrVOvlKnUGlQqNQpVLz9YrXhl96xYtVLlor+sa33LeUqFKsN/1WnVGtcdhrfS0lEvcQoEQm3tuPRMImIvx0JEffLRDREPOLXRcLEoyHikRCxSLve2+4ow2Fi/n4sEiIW7jwW0pWtV0jJj4jIHvGmaEQvuyITeAP2eslLkpZyFRb9ufWLufb+7MImS/nKjuW9W/9OPMJYNt6+8TgTZ8w/+zqS8eq3bkweTseCvHjDk8CHzWwUKOFNZ/seMLmV0DjnFsxswm9/ADjf8f4Lft2O5MfM3gO8B2B6evqq/QIisrtQyFrfuXvFOUe51qRUa3ivaoNyzUuOvP065VqTsn98q22l5rUr15qU6x3btQbrpRqVWoNq3duv1JtU/O168woyrV1EQkYsEiIa3p4cRcPt+mjYq4+GzduPvGi/YzsSDhHr2G7Vh9ptI2EjGjYioa3t9vGIXx8NG+FQ+1ikVRrRUO+SNiU/IiJdFOqYJnLzvpduW641WM5XWM5XWc5VWM5XWClUW3Ur+Qpnlot8/+waq4UqlxtHBxIRvviH93Bs4uXn8PcT59ysmX0UeATIAz8EXupmg91G4l2j6pz7OPBxgDvvvHNv/gcjIj1lZiT9qW/d0Gg6qvUmlbqfFNWaVBte4lSpN6nWm1Qbfln3jlVqTWoN/3jnMX+/1vB+Zq3Rfm39rFKtwWa52XHca1tvbtU56s3mrrMTrgYziIZChP2E6BO/dyd3Hxm96v+ukh8RkWtUIhr2n02Uetm2jaZr3XC8Uugo81VWChXGMlc+veR65pz7JPBJADP7CN7VnEtb09nMbB+w6De/ABzqePtBYL6b/RWR4AiHupts/aScc9QajlqjSb3hqDX90k+m6s2tpMlRb7STpq029Wb7vY3m9vc3ms5/n/8e/2fUm47JgURXfj8lPyIifSAcat+XdLzXnbmGmNmEc27RzKaB3wB+HrgBuA94wC8f8ps/DLzPzD6Ht9DBhu73EZGgMTNiEW/KXD9S8iMiIv3sC/49PzXgvf6S1g8AD5rZu4FzwDv8tl/Buy/oJN5S1+/qRYdFROTqUfIjIiJ9yzn3i7vUrQC/vEu9A97bjX6JiEhv9Of1LBERERERkRdR8iMiIiIiIoGg5EdERERERAJByY+IiIiIiASCkh8REREREQkE8xa3uTaZ2RJw9qf8MWPA8h50px8oFm2KRZtisZ3i0dYZixnn3HgvO3Ot2oOxSn9zbYpFm2LRplhsp3i0veJx6ppOfvaCmX3POXdnr/txLVAs2hSLNsViO8WjTbHoDsW5TbFoUyzaFIvtFI+2K4mFpr2JiIiIiEggKPkREREREZFACELy8/Fed+Aaoli0KRZtisV2ikebYtEdinObYtGmWLQpFtspHm2vOBZ9f8+PiIiIiIgIBOPKj4iIiIiISP8mP2b2JjN71sxOmtn9ve5Pt5nZp8xs0cye7KgbMbNHzOx5vxzuZR+7xcwOmdnXzGzWzJ4ys/f79YGLh5klzOz/zOyHfiz+wq+/wcwe82PxeTOL9bqv3WJmYTN73Mz+zd8PZCzM7IyZ/djMnjCz7/l1gfuMdJPGKY1TWzROtWmc
2knjlGevxqm+TH7MLAz8LfBm4BbgnWZ2S2971XX/ALzpRXX3A191zh0HvurvB0Ed+GPn3M3A3cB7/b+HIMajArzBOfdq4DbgTWZ2N/BR4K/9WKwB7+5hH7vt/cBsx36QY/FLzrnbOpYNDeJnpCs0TgEapzppnGrTOLWTxqm2n3qc6svkB7gLOOmcO+2cqwKfA+7tcZ+6yjn3DWD1RdX3Ap/2tz8NvK2rneoR59yCc+4H/nYO7wvkAAGMh/Pk/d2o/3LAG4B/8esDEQsAMzsI/Crw9/6+EdBYXEbgPiNdpHFK41SLxqk2jVPbaZx6Wa/4M9Kvyc8B4HzH/gW/LugmnXML4H3RAhM97k/Xmdlh4HbgMQIaD//y+RPAIvAIcApYd87V/SZB+rx8DPhToOnvjxLcWDjgP83s+2b2Hr8ukJ+RLtE4tbvA/81pnNI49SIap9r2ZJyKXMUO9pLtUqdl7QLOzDLAF4A/cs5teidPgsc51wBuM7Mh4EvAzbs1626vus/M3gosOue+b2av36repWnfx8J3j3Nu3swmgEfM7Jled6jPBflvTS5D45RH45RH49QOezJO9euVnwvAoY79g8B8j/pyLblkZvsA/HKxx/3pGjOL4g0o/+Sc+6JfHdh4ADjn1oGv480vHzKzrZMhQfm83AP8mpmdwZty9Aa8M2xBjAXOuXm/XMT7z8ZdBPwzcpVpnNpdYP/mNE7tpHFK41SnvRqn+jX5+S5w3F8NIwb8NvBwj/t0LXgYuM/fvg94qId96Rp/fuwngVnn3F91HApcPMxs3D+ThpklgV/Bm1v+NeDtfrNAxMI59wHn3EHn3GG874j/cs79DgGMhZmlzSy7tQ28EXiSAH5Gukjj1O4C+TencapN41Sbxqm2vRyn+vYhp2b2FrzsOAx8yjn34R53qavM7LPA64Ex4BLwQeBfgQeBaeAc8A7n3ItvNu07ZvYLwDeBH9OeM/vnePOpAxUPM/sZvBsCw3gnPx50zn3IzI7gnVUaAR4Hftc5V+ldT7vLn07wJ865twYxFv7v/CV/NwL8s3Puw2Y2SsA+I92kcUrj1BaNU20ap3ancWrvxqm+TX5EREREREQ69eu0NxERERERkW2U/IiIiIiISCAo+RERERERkUBQ8iMiIiIiIoGg5EdERERERAJByY+IiIiIiASCkh8REREREQkEJT8iIiIiIhII/w+GUU38MYLbAwAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
      " - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plot_metric(view_name='perplexity', value_name='value', title_name='Perplexity')" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Last values:\n", - "\tthetaless – 1227.15234375\n", - "\tplsa – 1314.651123046875\n", - "\tsparse – 1528.09033203125\n", - "\tsparse_thetaless – 1408.190185546875\n", - "\t\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
      " - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plot_metric(view_name='test_perplexity', value_name='value', title_name='Test perplexity')" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Last values:\n", - "\tthetaless – 0.9200233817100525\n", - "\tplsa – 0.7717972993850708\n", - "\tsparse – 0.9903578758239746\n", - "\tsparse_thetaless – 0.9920385479927063\n", - "\t\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
      " - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plot_metric(view_name='sparsity', value_name='value', title_name='Sparsity')" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Last values:\n", - "\tthetaless – 0.7567857503890991\n", - "\tplsa – 0.6893280148506165\n", - "\tsparse – 0.8266897797584534\n", - "\tsparse_thetaless – 0.8727331757545471\n", - "\t\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
      " - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plot_metric(view_name='kernel', value_name='average_contrast', title_name='Kernel Contrast')" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Last values:\n", - "\tthetaless – 0.9363216161727905\n", - "\tplsa – 0.7150622010231018\n", - "\tsparse – 0.7518076300621033\n", - "\tsparse_thetaless – 0.9405104517936707\n", - "\t\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
      " - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plot_metric(view_name='kernel', value_name='average_purity', title_name='Kerne purity')" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Last values:\n", - "\tthetaless – 3574.2333984375\n", - "\tplsa – 3774.466552734375\n", - "\tsparse – 623.433349609375\n", - "\tsparse_thetaless – 621.7000122070312\n", - "\t\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
      " - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plot_metric(view_name='kernel', value_name='average_size', title_name='Kernel size')" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Last values:\n", - "\tthetaless – 1.2295665407796712\n", - "\tplsa – 0.7130331816199633\n", - "\tsparse – 0.6366304865617266\n", - "\tsparse_thetaless – 1.0301379151242755\n", - "\t\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
      " - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plot_metric(view_name='coherence_ppmi', value_name='value', title_name='Top-20 Coherence')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Coherence evaluation on an external corpus (with Palmetto)\n", - "\n", - "Unfortunately, the service isn't available right now" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "from topicnet.viewers.top_tokens_viewer import TopTokensViewer\n", - "import requests\n", - "\n", - "\n", - "def eval_top_words(name, model, num_top_tokens=10):\n", - " ttv = TopTokensViewer(model, method='blei', num_top_tokens=num_top_tokens)\n", - " output = ttv.view()\n", - " S = \"http://palmetto.aksw.org/palmetto-webapp/service/{}?words=\".format(\"umass\")\n", - " topic_score = []\n", - " for key, modalities in output.items():\n", - " if 'topic' in key:\n", - " words = list(output[key]['@default_class'].keys())\n", - " print(S + \"%20\".join(words))\n", - " result = requests.get(S + \"%20\".join(words)).text\n", - " topic_score += [float(result)]\n", - " print(round(np.mean(topic_score), 2), round(np.median(topic_score), 2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# eval_top_words('thetaless', thetaless_model)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# eval_top_words('plsa', plsa_model)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# eval_top_words('sparse', sparse_model)" - ] - 
}, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# eval_top_words('sparse_thetaless', sparse_thetaless_model)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Let's look at topics" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "def print_top_words(name, model, num_top_tokens=10):\n", - " ttv = TopTokensViewer(model, method='blei', num_top_tokens=num_top_tokens)\n", - " output = ttv.view()\n", - " for key, modalities in output.items():\n", - " if 'topic' in key:\n", - " print(', '.join(output[key]['@default_class'].keys()))\n", - " print()" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "this, have, has, any, would, know, does, been, anyone, problem\n", - "\n", - "key, system, information, public, chip, number, data, encryption, message, access\n", - "\n", - "my, me, am, car, up, .., ...., :-), buy, bike\n", - "\n", - "they, there, what, no, all, them, people, because, here, even\n", - "\n", - "god, who, jesus, life, his, man, .\", bible, christian, church\n", - "\n", - "10, 00, 15, 12, 25, game, 11, team, 20, 14\n", - "\n", - "of, are, ..., more, than, many, these, course, different, article\n", - "\n", - "is, as, an, of, not, such, the, however, be, example\n", - "\n", - "space, research, center, nasa, earth, 1993, launch, data, science, dr\n", - "\n", - "or, for, use, also, can, please, need, thanks, etc, mail\n", - "\n", - "it, but, so, one, think, could, only, why, good, really\n", - "\n", - "the, of, by, one, must, )., book, current, subject, should\n", - "\n", - "windows, drive, software, card, dos, system, disk, bit, pc, scsi\n", - "\n", - "to, be, will, the, able, 
order, make, time, next, of\n", - "\n", - "were, of, their, world, ,\", armenian, by, war, armenians, turkish\n", - "\n", - "graphics, ftp, edu, contact, available, runs, ->, fax, 800, pub\n", - "\n", - "for, new, best, each, used, old, price, original, 50, 100\n", - "\n", - "it, is, which, the, where, most, )., since, when, find\n", - "\n", - "the, at, into, two, first, and, second, power, end, place\n", - "\n", - "that, not, say, would, believe, being, fact, true, mean, reason\n", - "\n", - "you, if, can, do, your, how, get, want, should, re\n", - "\n", - "the, on, with, be, other, may, ),, all, and, same\n", - "\n", - "of, by, the, law, its, state, against, their, under, right\n", - "\n", - "we, --, us, our, their, who, people, government, mr, president\n", - "\n", - "file, program, );, window, */, /*, code, server, output, files\n", - "\n", - "and, in, from, the, of, over, years, through, before, after\n", - "\n", - "gm, +\", :), ah, +', mm, \\<, slave, mp, :<\n", - "\n", - "was, he, had, were, his, said, him, did, didn, she\n", - "\n", - "year, last, gun, control, by, 000, bill, local, week, national\n", - "\n", - "about, like, just, well, don, now, time, very, much, ve\n", - "\n" - ] - } - ], - "source": [ - "print_top_words('thetaless', thetaless_model)" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "this, have, the, any, is, to, does, not, know, anyone\n", - "\n", - "is, key, of, and, the, system, information, are, public, chip\n", - "\n", - "to, me, my, or, the, on, if, have, in, would\n", - "\n", - "what, that, of, the, there, not, just, they, no, but\n", - "\n", - "he, we, god, his, that, and, us, our, him, who\n", - "\n", - "00, 10, game, 15, team, 25, 12, 11, 20, 13\n", - "\n", - "of, ..., that, to, are, is, and, with, not, the\n", - "\n", - "is, as, the, that, of, not, to, be, an, and\n", - "\n", - "of, and, space, 
for, 1993, research, center, in, national, by\n", - "\n", - "for, to, can, or, and, if, get, use, please, also\n", - "\n", - "it, but, have, so, that, think, the, to, really, why\n", - "\n", - "the, of, in, is, are, be, to, one, )., by\n", - "\n", - "with, windows, drive, system, card, dos, software, bit, disk, scsi\n", - "\n", - "to, is, the, on, in, this, of, with, it, can\n", - "\n", - "of, the, in, and, were, by, their, armenian, war, armenians\n", - "\n", - "from, available, on, list, ftp, and, mail, edu, )., version\n", - "\n", - "for, and, new, good, very, with, all, best, old, price\n", - "\n", - "it, is, the, in, be, too, of, an, to, are\n", - "\n", - "the, and, at, into, first, in, two, on, second, back\n", - "\n", - "that, not, to, believe, of, who, would, say, think, are\n", - "\n", - "you, your, to, it, and, do, can, if, how, are\n", - "\n", - "the, be, with, in, will, and, it, on, are, if\n", - "\n", - "of, the, by, as, to, state, in, and, israel, law\n", - "\n", - "the, to, we, and, they, that, in, their, people, --\n", - "\n", - "file, the, program, window, );, */, /*, code, server, output\n", - "\n", - "the, in, and, to, of, from, years, after, was, over\n", - "\n", - "55, ----, gm, :), __, slave, +\", master, ah, air\n", - "\n", - "was, they, he, had, were, out, said, the, when, up\n", - "\n", - "the, of, and, in, by, on, for, to, at, year\n", - "\n", - "about, it, would, but, don, well, the, just, to, there\n", - "\n" - ] - } - ], - "source": [ - "print_top_words('plsa', plsa_model)" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "this, any, have, to, the, does, is, anyone, not, problem\n", - "\n", - "is, the, are, key, of, used, as, and, will, public\n", - "\n", - "to, or, the, have, if, on, my, for, be, in\n", - "\n", - "that, what, the, of, but, just, no, it, something, here\n", - "\n", - "we, that, me, there, 
know, our, they, what, us, so\n", - "\n", - "10, 00, 20, 15, 12, 25, 11, game, 14, 16\n", - "\n", - "that, is, of, are, to, not, you, this, the, and\n", - "\n", - "the, is, as, to, of, that, be, an, not, or\n", - "\n", - "and, of, for, space, on, the, data, high, in, research\n", - "\n", - "for, to, use, and, or, please, can, on, mail, also\n", - "\n", - "it, have, but, so, think, that, to, really, not, the\n", - "\n", - "the, not, of, is, god, that, jesus, be, must, as\n", - "\n", - "with, windows, thanks, drive, software, system, card, using, version, bit\n", - "\n", - "to, is, the, get, can, if, it, in, this, of\n", - "\n", - "of, the, in, and, their, by, were, people, against, armenian\n", - "\n", - "),, )., net, list, 80, contact, 22, box, runs, 60\n", - "\n", - "for, and, new, good, very, )., or, have, work, all\n", - "\n", - "is, it, the, too, an, in, be, or, are, to\n", - "\n", - "the, and, first, into, in, at, back, second, with, to\n", - "\n", - "that, to, not, who, would, be, of, are, in, believe\n", - "\n", - "you, your, to, it, can, do, and, how, if, are\n", - "\n", - "the, be, with, in, will, may, on, and, this, are\n", - "\n", - "of, the, by, as, to, state, and, in, from, which\n", - "\n", - "the, to, and, in, they, their, been, of, government, that\n", - "\n", - "the, file, program, );, window, */, /*, code, by, source\n", - "\n", - "the, in, and, to, from, of, through, where, over, up\n", - "\n", - "the, at, ..., --, in, year, will, new, last, team\n", - "\n", - "was, he, they, had, his, were, said, him, and, the\n", - "\n", - "the, and, of, in, on, by, for, two, to, gun\n", - "\n", - "it, about, my, the, on, off, to, and, well, but\n", - "\n" - ] - } - ], - "source": [ - "print_top_words('sparse', sparse_model)" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "this, have, any, has, know, does, anyone, work, problem, 
would\n", - "\n", - "key, number, information, public, group, chip, news, message, encryption, posting\n", - "\n", - "my, me, like, up, am, get, could, want, on, someone\n", - "\n", - "there, what, no, don, they, some, even, see, why, here\n", - "\n", - "we, he, who, his, people, us, our, all, him, say\n", - "\n", - "10, 00, 20, 15, 12, 25, 11, game, 14, 16\n", - "\n", - "are, of, more, ..., than, many, only, case, different, course\n", - "\n", - "is, as, the, not, such, of, however, or, example, makes\n", - "\n", - "space, 1993, university, high, .,, national, research, center, science, of\n", - "\n", - "use, on, system, or, please, thanks, can, etc, mail, hard\n", - "\n", - "it, but, so, good, one, only, really, take, doesn, thing\n", - "\n", - "god, of, the, must, \"., jesus, one, not, person, word\n", - "\n", - "windows, drive, software, card, dos, version, image, bit, disk, pc\n", - "\n", - "to, the, get, on, order, way, able, when, then, time\n", - "\n", - "their, were, world, against, of, ,\", armenian, during, .\", children\n", - "\n", - "->, 100, runs, 800, 200, ----, 90, 300, steve, 500\n", - "\n", - "for, and, )., new, ),, all, best, each, both, call\n", - "\n", - "an, or, which, the, where, most, is, of, find, should\n", - "\n", - "the, into, first, and, power, second, end, place, db, three\n", - "\n", - "that, not, would, think, believe, made, fact, being, mean, be\n", - "\n", - "you, if, can, do, your, how, re, make, should, then\n", - "\n", - "with, be, the, will, also, may, on, and, other, these\n", - "\n", - "of, by, the, its, state, law, israel, rights, states, church\n", - "\n", - "the, and, --, been, to, government, those, has, them, people\n", - "\n", - "file, program, );, window, */, /*, code, available, source, ftp\n", - "\n", - "in, and, from, the, over, through, years, found, several, of\n", - "\n", - "team, win, st, series, points, mark, york, 60, 000, division\n", - "\n", - "was, they, had, were, after, when, said, didn, she, out\n", - 
"\n", - "at, the, on, two, and, of, year, last, control, gun\n", - "\n", - "about, out, just, well, time, on, now, very, ve, right\n", - "\n" - ] - } - ], - "source": [ - "print_top_words('sparse_thetaless', sparse_thetaless_model)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Last coherence values (using different number of top tokens)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The plots above show how the coherence (PPMI) of top 20 tokens evolves. In addition, we can look at the last PMI and PPMI values using different sizes of top-token list (5, 10, 20, 50, 100 words)" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [], - "source": [ - "occurences, co_occurences = calc_doc_occurrences(train_n_dw_matrix)\n", - "calc_pmi = create_pmi_top_function(\n", - " occurences, co_occurences,\n", - " test_n_dw_matrix.shape[0], [5, 10, 20, 50, 100],\n", - " co_occurrences_smooth=1.\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([0.39794082, 0.5118048 , 0.67965463, 0.88237313, 1.04378611]),\n", - " array([0.42403162, 0.5602503 , 0.71303318, 0.93598612, 1.11396004]))" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "calc_pmi(plsa_model.get_phi_dense()[0].T)" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([0.91811772, 1.01885883, 1.20539864, 1.29182469, 1.36843634]),\n", - " array([0.91811772, 1.05672401, 1.22956654, 1.38632289, 1.55204473]))" - ] - }, - "execution_count": 35, - "metadata": {}, - 
"output_type": "execute_result" - } - ], - "source": [ - "calc_pmi(thetaless_model.get_phi_dense()[0].T)" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([0.38628136, 0.48798776, 0.63637099, 0.84816317, 1.00993652]),\n", - " array([0.38628136, 0.48798776, 0.63663049, 0.86840002, 1.06419641]))" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "calc_pmi(sparse_model.get_phi_dense()[0].T)" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([0.73225952, 0.86980454, 1.01066112, 1.1870601 , 1.11520662]),\n", - " array([0.73225952, 0.87038446, 1.03023163, 1.25691501, 1.40597935]))" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "calc_pmi(sparse_thetaless_model.get_phi_dense()[0].T)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Also, let's take a look at train + test together, instead of relying on train only:" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "matrix = scipy.sparse.vstack((train_n_dw_matrix, test_n_dw_matrix))\n", - "occurences, co_occurences = calc_doc_occurrences(matrix)\n", - "calc_pmi = create_pmi_top_function(\n", - " occurences, co_occurences,\n", - " matrix.shape[0], [5, 10, 20, 50, 100],\n", - " co_occurrences_smooth=1.\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([0.39331874, 0.50256166, 0.6726003 , 0.86943263, 1.02683013]),\n", - " array([0.42403052, 0.56024917, 0.71303195, 0.93598439, 1.11395708]))" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "calc_pmi(plsa_model.get_phi_dense()[0].T)" - ] 
- }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([0.91811657, 1.01064248, 1.20028921, 1.27103585, 1.32921158]),\n", - " array([0.91811657, 1.05672276, 1.22956457, 1.38631923, 1.55203787]))" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "calc_pmi(thetaless_model.get_phi_dense()[0].T)" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([0.38628123, 0.48798752, 0.63637053, 0.84472894, 1.00085916]),\n", - " array([0.38628123, 0.48798752, 0.63663005, 0.8683986 , 1.06419316]))" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "calc_pmi(sparse_model.get_phi_dense()[0].T)" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([0.73225886, 0.86980354, 1.00774106, 1.17468343, 1.06144551]),\n", - " array([0.73225886, 0.87038357, 1.03023015, 1.25691137, 1.40597128]))" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "calc_pmi(sparse_thetaless_model.get_phi_dense()[0].T)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Conclusion" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We observed several interesting things.\n", - "\n", - "1) the regularizer improves coherence and sparsity without explicitly optimizing for it\n", - "\n", - "2) The regularizer moves common but uninformative words away from \"informative\" topics. 
Compare the second topic of PLSA: \n", - "\n", - "`is, key, of, and, the, system, information, are, public, chip`\n", - "\n", - "with the second topic of thetaless:\n", - "\n", - "`key, system, information, public, chip, number, data, encryption, message, access`\n", - "\n", - "It should be noted that we had not specified the usual distincton between specific and background topics, the separation here is purely emergent.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/topicnet/tests/test_cube_controller.py b/topicnet/tests/test_cube_controller.py index a716757..1797e6f 100644 --- a/topicnet/tests/test_cube_controller.py +++ b/topicnet/tests/test_cube_controller.py @@ -32,7 +32,7 @@ def resource_teardown(): """ """ - dataset = Dataset(f'tests/test_data/test_dataset.csv') + dataset = Dataset('tests/test_data/test_dataset.csv') if os.path.exists("tests/experiments"): shutil.rmtree("tests/experiments") @@ -150,7 +150,7 @@ def experiment_enviroment(request): """ """ with warnings.catch_warnings(): warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1) - dataset = Dataset(f'tests/test_data/test_dataset.csv') + dataset = Dataset('tests/test_data/test_dataset.csv') dictionary = dataset.get_dictionary() model_artm = artm.ARTM( diff --git a/topicnet/tests/test_cube_utils.py b/topicnet/tests/test_cube_utils.py index faaa7a3..3ab5e3f 100644 --- a/topicnet/tests/test_cube_utils.py +++ b/topicnet/tests/test_cube_utils.py @@ -1,6 +1,9 @@ import pytest -from 
topicnet.cooking_machine.cubes.controller_cube import PerplexityScoreController, ControllerAgent +from ..cooking_machine.cubes.controller_cube import ( + ControllerAgent, + PerplexityScoreController, +) DATA_REG_CONTROLLER_SORT_OF_DECREASING = [ ([246.77072143554688, diff --git a/topicnet/tests/test_cubes.py b/topicnet/tests/test_cubes.py index ceae819..f04a12d 100644 --- a/topicnet/tests/test_cubes.py +++ b/topicnet/tests/test_cubes.py @@ -10,7 +10,9 @@ from ..cooking_machine.cubes.greedy_strategy import GreedyStrategy from ..cooking_machine.cubes.perplexity_strategy import PerplexityStrategy -from ..cooking_machine.cubes import RegularizersModifierCube, CubeCreator +from ..cooking_machine.cubes import ( + RegularizersModifierCube, CubeCreator, RegularizationControllerCube +) from ..cooking_machine.models.topic_model import TopicModel from ..cooking_machine.models.topic_prior_regularizer import TopicPriorRegularizer from ..cooking_machine.models.topic_prior_regularizer import TopicPriorSampledRegularizer @@ -18,7 +20,8 @@ from ..cooking_machine.dataset import Dataset, W_DIFF_BATCHES_1 from ..cooking_machine.rel_toolbox_lite import count_vocab_size, compute_regularizer_gimel -DATA_PATH = f'tests/test_data/test_dataset.csv' + +DATA_PATH = 'tests/test_data/test_dataset.csv' MAIN_MODALITY = "@text" NGRAM_MODALITY = "@ngramms" @@ -92,11 +95,11 @@ def test_simple_experiment(experiment_enviroment, thread_flag): """ """ tm, dataset, experiment, dictionary = experiment_enviroment - TAU_GRID = [0.1, 0.5, 1, 5, 10] + tau_grid = [0.1, 0.5, 1, 5, 10] regularizer_parameters = { "regularizer": artm.regularizers.SmoothSparsePhiRegularizer(name='test', class_ids=MAIN_MODALITY), - "tau_grid": TAU_GRID + "tau_grid": tau_grid } cube = RegularizersModifierCube( @@ -110,9 +113,9 @@ def test_simple_experiment(experiment_enviroment, thread_flag): tmodels = [dummy.restore() for dummy in dummies] - assert len(tmodels) == len(TAU_GRID) + assert len(tmodels) == len(tau_grid) for i, one_model 
in enumerate(tmodels): - assert one_model.regularizers['test'].tau == TAU_GRID[i] + assert one_model.regularizers['test'].tau == tau_grid[i] @pytest.mark.parametrize('thread_flag', MULTIPROCESSING_FLAGS) @@ -120,11 +123,11 @@ def test_simple_experiment_pair_strategy(experiment_enviroment, thread_flag): """ """ tm, dataset, experiment, dictionary = experiment_enviroment - TAU_GRID = [0.1, 0.5, 1, 5, 10] + tau_grid = [0.1, 0.5, 1, 5, 10] regularizer_parameters = { "regularizer": artm.regularizers.SmoothSparsePhiRegularizer(name='test', class_ids=MAIN_MODALITY), - "tau_grid": TAU_GRID + "tau_grid": tau_grid } cube_pair = RegularizersModifierCube( @@ -145,7 +148,7 @@ def test_simple_experiment_pair_strategy(experiment_enviroment, thread_flag): assert len(tmodels_pair) == 5 for i, one_model in enumerate(tmodels_pair): - assert one_model.regularizers['test'].tau == TAU_GRID[i] + assert one_model.regularizers['test'].tau == tau_grid[i] @pytest.mark.parametrize('thread_flag', MULTIPROCESSING_FLAGS) @@ -200,11 +203,11 @@ def test_relative_coefficients(experiment_enviroment, artm_regularizer, thread_f tm, dataset, experiment, dictionary = experiment_enviroment modality_weights = tm.class_ids random_tau = np.random.rand() + 0.01 - TAU_GRID = np.array([1.0, 2.0, 3.0, 4.0, 5.0, ]) + tau_grid = np.array([1.0, 2.0, 3.0, 4.0, 5.0, ]) regularizer_parameters = { "regularizer": artm_regularizer, - "tau_grid": TAU_GRID * random_tau + "tau_grid": tau_grid * random_tau } cube_first = RegularizersModifierCube( @@ -228,9 +231,9 @@ def test_relative_coefficients(experiment_enviroment, artm_regularizer, thread_f data_stats = count_vocab_size(dictionary, modality_weights) gimels = [] if artm_regularizer.name == 'test_decor': - gimels = TAU_GRID * random_tau + gimels = tau_grid * random_tau else: - for tau in TAU_GRID * random_tau: + for tau in tau_grid * random_tau: artm_regularizer._tau = tau gimels.append(compute_regularizer_gimel( data_stats, @@ -254,12 +257,12 @@ def 
test_relative_coefficients(experiment_enviroment, artm_regularizer, thread_f second_cube_models = cube_second(tm, dataset) - assert len(first_cube_tmodels) == len(TAU_GRID) == len(second_cube_models) + assert len(first_cube_tmodels) == len(tau_grid) == len(second_cube_models) if artm_regularizer.name == 'test_decor': for one_model, second_model in zip(first_cube_tmodels, second_cube_models): assert one_model.scores['PerplexityScore'] != second_model.scores['PerplexityScore'] else: - assert np.all(gimels != TAU_GRID * random_tau) + assert np.all(gimels != tau_grid * random_tau) for one_model, second_model in zip(first_cube_tmodels, second_cube_models): assert one_model.scores['PerplexityScore'] == second_model.scores['PerplexityScore'] @@ -621,11 +624,11 @@ def test_perplexity_strategy_grid(experiment_enviroment, thread_flag): """ """ tm, dataset, experiment, dictionary = experiment_enviroment - TAU_GRID = [0.1, 0.5, 1, 5, 50] + tau_grid = [0.1, 0.5, 1, 5, 50] regularizer_parameters = { "regularizer": artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=MAIN_MODALITY), - "tau_grid": TAU_GRID + "tau_grid": tau_grid } cube = RegularizersModifierCube( @@ -642,7 +645,7 @@ def test_perplexity_strategy_grid(experiment_enviroment, thread_flag): tmodels = [dummy.restore() for dummy in dummies] visited_taus = extract_visited_taus(tmodels) - expected_taus = [0] + TAU_GRID + expected_taus = [0] + tau_grid assert visited_taus == expected_taus SCORES = [3.756, 3.756, 3.753, 3.75, 3.72, 2.887] @@ -821,3 +824,119 @@ def test_phi_matrix_after_lda_sampled_regularizer(experiment_enviroment): phi_second = tm_2.get_phi() assert any(phi_first != phi_second), 'Phi matrices are the same after regularization.' 
+ + +@pytest.mark.parametrize('thread_flag', MULTIPROCESSING_FLAGS) +@pytest.mark.parametrize('by_name', [True, False]) +def test_custom_regularizer_cubed(experiment_enviroment, thread_flag, by_name): + """ """ + _, dataset, _, dictionary = experiment_enviroment + tau_grid = [1, 0, -1] + + custom_reg = TopicPriorSampledRegularizer( + name='topic_prior', tau=5, + num_topics=5, beta_prior=[10, 1, 100, 2, 1000] + ) + + model_artm = artm.ARTM( + num_processors=1, + num_topics=5, + cache_theta=True, + class_ids={MAIN_MODALITY: 1.0, NGRAM_MODALITY: 1.0}, + num_document_passes=1, + dictionary=dictionary, + scores=[artm.PerplexityScore(name='PerplexityScore',)], + ) + tm = TopicModel( + model_artm, model_id='new_id_1', + custom_regularizers={custom_reg.name: custom_reg} if by_name else {} + ) + experiment = Experiment( # noqa: F841 + tm, experiment_id="cubed_reg", save_path="tests/experiments" + ) + + if by_name: + regularizer_parameters = { + "name": custom_reg.name, + "tau_grid": tau_grid + } + else: + regularizer_parameters = { + "regularizer": custom_reg, + "tau_grid": tau_grid + } + + cube = RegularizersModifierCube( + num_iter=10, + regularizer_parameters=regularizer_parameters, + reg_search="grid", + use_relative_coefficients=False, + separate_thread=thread_flag + ) + dummies = cube(tm, dataset) + + tmodels = [dummy.restore() for dummy in dummies] + + assert len(tmodels) == len(tau_grid) + for tau, one_model in zip(tau_grid, tmodels): + assert one_model.all_regularizers[custom_reg.name].tau == tau + + +@pytest.mark.parametrize('thread_flag', MULTIPROCESSING_FLAGS) +@pytest.mark.parametrize('by_name', [True, False]) +def test_custom_regularizer_cubed_controlled(experiment_enviroment, thread_flag, by_name): + """ """ + _, dataset, _, dictionary = experiment_enviroment + multiplier = 2 + initial_tau = 5 + + custom_reg = TopicPriorSampledRegularizer( + name='topic_prior', tau=initial_tau, + num_topics=5, beta_prior=[10, 1, 100, 2, 1000] + ) + + model_artm = artm.ARTM( + 
num_processors=1, + num_topics=5, + cache_theta=True, + class_ids={MAIN_MODALITY: 1.0, NGRAM_MODALITY: 1.0}, + num_document_passes=1, + dictionary=dictionary, + scores=[artm.PerplexityScore(name='PerplexityScore',)], + ) + tm = TopicModel( + model_artm, model_id='new_id_1', + custom_regularizers={custom_reg.name: custom_reg} if by_name else {} + ) + experiment = Experiment( # noqa: F841 + tm, experiment_id="cubed_controlled_reg", save_path="tests/experiments" + ) + + parameters = { + "score_to_track": None, + "tau_converter": f"prev_tau * {multiplier}", + "user_value_grid": [0.3], + "max_iters": float("inf") + } + + if by_name: + parameters["reg_name"] = custom_reg.name + else: + parameters["regularizer"] = custom_reg + + num_iter = 10 + cube = RegularizationControllerCube( + num_iter=num_iter, + parameters=parameters, + reg_search="grid", + use_relative_coefficients=False, + separate_thread=thread_flag + ) + dummies = cube(tm, dataset) + + tmodels = [dummy.restore() for dummy in dummies] + + for one_model in tmodels: + actual_tau = one_model.all_regularizers[custom_reg.name].tau + + assert actual_tau == initial_tau * (multiplier ** num_iter) diff --git a/topicnet/tests/test_dataset.py b/topicnet/tests/test_dataset.py index adbd3c1..9d6e996 100644 --- a/topicnet/tests/test_dataset.py +++ b/topicnet/tests/test_dataset.py @@ -60,7 +60,6 @@ def test_get_dict_two_times(self, small): assert len(record) == 0 - @pytest.mark.xfail @pytest.mark.parametrize("small", KEEP_DATA) def test_get_dict_two_times_alternating(self, small): """ """ diff --git a/topicnet/tests/test_experiment.py b/topicnet/tests/test_experiment.py index 5b94479..b893aca 100644 --- a/topicnet/tests/test_experiment.py +++ b/topicnet/tests/test_experiment.py @@ -173,6 +173,9 @@ def test_double_steps_experiment(two_experiment_enviroments, thread_flag): dummies_second = cube_first_2(tm_2, dataset) tmodels_lvl2_2 = [dummy.restore() for dummy in dummies_second] + for models in zip(tmodels_lvl2_1, 
tmodels_lvl2_2): + assert np.array_equal(models[0].get_phi(), models[1].get_phi()) + experiment_1.save_path = 'tests/experiments/test' experiment_1.save() experiment = Experiment.load('tests/experiments/test_1') @@ -210,9 +213,6 @@ def test_double_steps_experiment(two_experiment_enviroments, thread_flag): assert len(tmodels_lvl3) == len(tmodels_lvl3_2) assert cube_second.strategy.score == cube_second_2.strategy.score - for models in zip(tmodels_lvl2_1, tmodels_lvl2): - assert np.array_equal(models[0].get_phi(), models[1].get_phi()) - for models in zip(tmodels_lvl3, tmodels_lvl3_2): assert ( models[0].regularizers['test_second'].tau == models[1].regularizers['test_second'].tau diff --git a/topicnet/tests/test_experiment_restore.py b/topicnet/tests/test_experiment_restore.py new file mode 100644 index 0000000..646fd95 --- /dev/null +++ b/topicnet/tests/test_experiment_restore.py @@ -0,0 +1,295 @@ +import os +import pytest +import shutil + +from typing import ( + Dict, + List, +) + +import artm + +from ..cooking_machine.dataset import Dataset +from ..cooking_machine.experiment import Experiment +from ..cooking_machine.models.base_score import BaseScore +from ..cooking_machine.models.topic_model import TopicModel + +_MAIN_MODALITY = "@text" + +_ONE_CUBE_NUM_ITERATIONS = 10 +_NUM_CUBES = 3 +_INTERRUPT_CUBE_ITERATION = 5 +_ONCE_INTERRUPTED = [False] + +_TAU_GRID = [0.1, 0.5, 1.0] # better three or more values (for testing purposes) + +_SCORE_NAME = 'bad_score' +_REGULARIZER_NAME = 'reglurellriser' +_SELECT_CRITERION_FOR_ALL_MODELS = '' + +_DEBUG_MODE = False + + +class InterruptingScore(BaseScore): + def __init__(self, name: str, interrupt_cube: int, interrupt_tau: float): + super().__init__(name=name) + + self._iteration = 0 + self._interrupt_cube = interrupt_cube + self._interrupt_tau = interrupt_tau + + def call(self, model: TopicModel) -> float: + regularizer_tau = model.regularizers[_REGULARIZER_NAME].tau + current_cube = model.depth + current_cube_iteration = ( + 
len(model.scores[_SCORE_NAME]) - (current_cube - 1) * _ONE_CUBE_NUM_ITERATIONS + ) + + if (current_cube == self._interrupt_cube + and regularizer_tau == self._interrupt_tau + and current_cube_iteration >= _INTERRUPT_CUBE_ITERATION + and not _ONCE_INTERRUPTED[0]): + + _ONCE_INTERRUPTED[0] = True + + raise KeyboardInterrupt() + + self._iteration += 1 + + return self._iteration + + +class TestExperimentRestore: + dataset = None + dictionary = None + experiments_save_path = None + + @classmethod + def setup_class(cls): + cls.dataset = Dataset('tests/test_data/test_dataset.csv') + cls.dictionary = cls.dataset.get_dictionary() + cls.experiments_save_path = 'tests/experiments' + + def setup_method(self): + _ONCE_INTERRUPTED[0] = False + + os.makedirs(self.experiments_save_path, exist_ok=True) + + def teardown_method(self): + if os.path.isdir(self.experiments_save_path): + shutil.rmtree(self.experiments_save_path) + + @classmethod + def teardown_class(cls): + if os.path.isdir(cls.experiments_save_path): + shutil.rmtree(cls.experiments_save_path) + + if cls.dataset is not None: + cls.dataset.clear_folder() + + @pytest.mark.parametrize( + 'interrupt_cube_index, interrupt_model_index', + [(1, 0), (0, -1), (-1, 1)] + ) + def test_ctrl_c_and_proceed(self, interrupt_cube_index, interrupt_model_index): + self._test_ctrl_c_and_proceed( + interrupt_cube_index=interrupt_cube_index, + interrupt_model_index=interrupt_model_index, + thread_flag=False, + load_experiment=False, + ) + + # TODO: something happens in multiprocess, and it takes infinity to wait till the end + @pytest.mark.xfail + @pytest.mark.timeout(10) + @pytest.mark.parametrize( + 'interrupt_cube_index, interrupt_model_index', + [(1, 0)] # , (0, -1), (-1, 1)] + ) + def test_ctrl_c_and_proceed_multiprocess(self, interrupt_cube_index, interrupt_model_index): + self._test_ctrl_c_and_proceed( + interrupt_cube_index=interrupt_cube_index, + interrupt_model_index=interrupt_model_index, + thread_flag=True, + 
load_experiment=False, + ) + + # TODO: cubes are loaded as strings, not as Python objects -> experiment.run fails + @pytest.mark.xfail + @pytest.mark.parametrize( + 'interrupt_cube_index, interrupt_model_index', + [(1, 0), (0, -1), (-1, 1)] + ) + def test_ctrl_c_and_load(self, interrupt_cube_index, interrupt_model_index): + self._test_ctrl_c_and_proceed( + interrupt_cube_index=interrupt_cube_index, + interrupt_model_index=interrupt_model_index, + thread_flag=False, + load_experiment=True, + ) + + @pytest.mark.xfail + @pytest.mark.timeout(10) + @pytest.mark.parametrize( + 'interrupt_cube_index, interrupt_model_index', + [(1, 0)] # , (0, -1), (-1, 1)] + ) + def test_ctrl_c_and_load_multiprocess(self, interrupt_cube_index, interrupt_model_index): + self._test_ctrl_c_and_proceed( + interrupt_cube_index=interrupt_cube_index, + interrupt_model_index=interrupt_model_index, + thread_flag=True, + load_experiment=True, + ) + + def _test_ctrl_c_and_proceed( + self, + interrupt_cube_index: int, + interrupt_model_index: int, + thread_flag: bool, + load_experiment: bool) -> None: + + experiment = self._initialize_experiment( + experiment_id=f'Experiment_{thread_flag}', + interrupt_cube_index=interrupt_cube_index, + interrupt_model_index=interrupt_model_index, + ) + cube_settings = self._initialize_cube_settings(thread_flag) + experiment.build(cube_settings) + + models: List[TopicModel] = None + is_interrupt_detected = False + + try: + experiment.run( + self.dataset, verbose=False, nb_verbose=False + ) + except KeyboardInterrupt: + is_interrupt_detected = True + + if load_experiment: + experiment = Experiment.load( + os.path.join(experiment.save_path, experiment.experiment_id) + ) # TODO: need to concatenate? + + models = experiment.run( + self.dataset, verbose=False, nb_verbose=False, + restore_mode=True, + ) + finally: + self._print_debug_info(experiment) + + assert is_interrupt_detected, 'No KeyboardInterrupt detected!' 
+ + self._check_result(cube_settings, experiment, models) + + def _initialize_experiment( + self, + experiment_id: str, + interrupt_cube_index: int, + interrupt_model_index: int) -> Experiment: + + artm_model = artm.ARTM( + num_processors=1, + num_topics=5, + cache_theta=True, + num_document_passes=1, + dictionary=self.dictionary, + scores=[ + artm.PerplexityScore( + name='PerplexityScore' + ), + artm.SparsityPhiScore( + name='SparsityPhiScore', class_id=_MAIN_MODALITY + ) + ] + ) + + topic_model = TopicModel(artm_model, model_id='start_id') + interrupt_cube = list(range(_NUM_CUBES))[interrupt_cube_index] + 1 + interrupt_tau = _TAU_GRID[interrupt_model_index] + topic_model.scores.add( + InterruptingScore( + name=_SCORE_NAME, + interrupt_cube=interrupt_cube, + interrupt_tau=interrupt_tau, + ) + ) + + return Experiment( + topic_model, + experiment_id=experiment_id, + save_path=self.experiments_save_path, + ) + + def _initialize_cube_settings(self, separate_thread: bool) -> List[Dict]: + return [ + self._one_cube_description( + num_iter=_ONE_CUBE_NUM_ITERATIONS, + separate_thread=separate_thread, + ) + for _ in range(_NUM_CUBES) + ] + + def _one_cube_description(self, num_iter: int, separate_thread: bool) -> dict: + return { + 'RegularizersModifierCube': + { + 'num_iter': num_iter, + 'regularizer_parameters': + { + "regularizer": artm.regularizers.SmoothSparsePhiRegularizer( + name=_REGULARIZER_NAME + ), + "tau_grid": _TAU_GRID, + }, + 'reg_search': 'grid', + 'use_relative_coefficients': False, + 'separate_thread': separate_thread, + }, + 'selection': [_SELECT_CRITERION_FOR_ALL_MODELS] + } + + def _check_result( + self, + cube_settings: List[Dict], + experiment: Experiment, + models: List[TopicModel]) -> None: + + assert experiment.depth > 0 + assert len(models) == len(_TAU_GRID) ** (experiment.depth - 1) + assert experiment.depth == len(experiment.cubes) + assert experiment.depth == len(cube_settings) + 1 + + assert len(experiment.get_models_by_depth(0)) == 0 + 
assert len(experiment.get_models_by_depth(1)) == 1 + assert len(experiment.get_models_by_depth(2)) == 1 * len(_TAU_GRID) + + for d in range(3, experiment.depth + 1): + assert len(experiment.get_models_by_depth(d)) == len(_TAU_GRID) ** (d - 1) + + assert len(experiment.models) == sum( + len(_TAU_GRID) ** (d - 1) for d in range(1, experiment.depth + 1) + ) + + def _print_debug_info(self, experiment: Experiment) -> None: + if not _DEBUG_MODE: + return + + print(f'Experiment save path: {experiment.save_path}') + print(f'Experiment depth: {experiment.depth}') + print(f'Num cubes: {len(experiment.cubes)}') + + print('Cubes:' + '\n') + + for c in experiment.cubes: + print(c) + print() + + last_model = list(experiment.models.values())[-1] + score_names = last_model.scores.keys() + + print(score_names) + + if _SCORE_NAME in score_names: + print(last_model.scores[_SCORE_NAME]) diff --git a/topicnet/tests/test_experiment_select.py b/topicnet/tests/test_experiment_select.py index 094049a..f18c252 100644 --- a/topicnet/tests/test_experiment_select.py +++ b/topicnet/tests/test_experiment_select.py @@ -148,7 +148,8 @@ def set_score(self, name: str, values: list): @staticmethod def get_start_model(): - model = MockTopicModel(name=f'<<< Start Model >>>', depth=0) + # TODO: can we rename it because of Win compatibility? 
+ model = MockTopicModel(name='<<< Start Model >>>', depth=0) for score in SCORES: model.set_score(score, []) @@ -1082,7 +1083,7 @@ def test_constrained_optimization( def test_constraints_on_same_attribute_contradict(self, parameter, threshold, signs): experiment = TestExperimentSelect.get_experiment() - constraint_template = f'{{0}} {{1}} {{2}}' + constraint_template = f'{{0}} {{1}} {{2}}' # noqa: F541 query = combine_constraints( *[constraint_template.format(parameter, sign, threshold) for sign in signs] diff --git a/topicnet/tests/test_pipeline.py b/topicnet/tests/test_pipeline.py index 9014860..01fa060 100644 --- a/topicnet/tests/test_pipeline.py +++ b/topicnet/tests/test_pipeline.py @@ -13,9 +13,10 @@ from ..cooking_machine.dataset import Dataset, W_DIFF_BATCHES_1 from ..cooking_machine.config_parser import build_experiment_environment_from_yaml_config from ..cooking_machine.model_tracking import START +from ..cooking_machine.recipes import BaselineRecipe -# MULTIPROCESSING_FLAGS = [True, False] -MULTIPROCESSING_FLAGS = [True] +# TODO: MULTIPROCESSING_FLAGS = [True, False] +MULTIPROCESSING_FLAGS = [False] USE_MULTIPROCESS = True @@ -96,7 +97,7 @@ def experiment_enviroment(request): @pytest.mark.parametrize('thread_flag', MULTIPROCESSING_FLAGS) -def test_bad_empty_config(experiment_enviroment, thread_flag): +def test_bad_empty_config(thread_flag): with open("tests/test_data/bad_empty_config.yml", "r", encoding='utf-8') as f: yaml_string = f.read() @@ -113,7 +114,7 @@ def test_bad_empty_config(experiment_enviroment, thread_flag): @pytest.mark.parametrize('thread_flag', MULTIPROCESSING_FLAGS) -def test_bad_config(experiment_enviroment, thread_flag): +def test_bad_config(thread_flag): with open("tests/test_data/bad_config.yml", "r", encoding='utf-8') as f: yaml_string = f.read() @@ -127,7 +128,7 @@ def test_bad_config(experiment_enviroment, thread_flag): @pytest.mark.parametrize('thread_flag', MULTIPROCESSING_FLAGS) -def 
test_pipeline_from_config(experiment_enviroment, thread_flag): +def test_pipeline_from_config(thread_flag): with open("tests/test_data/config.yml", "r", encoding='utf-8') as f: yaml_string = f.read() @@ -188,7 +189,7 @@ def test_pipeline_from_config(experiment_enviroment, thread_flag): @pytest.mark.parametrize('thread_flag', MULTIPROCESSING_FLAGS) -def test_config_with_blei_score(experiment_enviroment, thread_flag): +def test_config_with_blei_score(thread_flag): with open("tests/test_data/config_blei.yml", "r", encoding='utf-8') as f: yaml_string = f.read() @@ -207,7 +208,7 @@ def test_config_with_blei_score(experiment_enviroment, thread_flag): @pytest.mark.parametrize('thread_flag', MULTIPROCESSING_FLAGS) -def test_config_with_scores(experiment_enviroment, thread_flag): +def test_config_with_scores(thread_flag): with open("tests/test_data/config_short.yml", "r", encoding='utf-8') as f: yaml_string = f.read() @@ -227,7 +228,7 @@ def test_config_with_scores(experiment_enviroment, thread_flag): @pytest.mark.parametrize('thread_flag', MULTIPROCESSING_FLAGS) -def test_config_with_greedy_strategy(experiment_enviroment, thread_flag): +def test_config_with_greedy_strategy(thread_flag): with open("tests/test_data/config2.yml", "r", encoding='utf-8') as f: yaml_string = f.read() @@ -286,3 +287,29 @@ def test_pipeline_with_new_cube_after(experiment_enviroment, thread_flag): assert len(new_models) == 3, 'Incorrect number of final models.' assert len(experiment.cubes) == 4, 'Incorrect number of cubes in the experiment.' assert len(experiment.criteria) == 4, 'Incorrect number of criteria in the experiment.' 
+ + +@pytest.mark.parametrize('thread_flag', MULTIPROCESSING_FLAGS) +def test_filter_dictionary(thread_flag): + datasets = dict() + big_dataset_name = 'big' + small_dataset_name = 'small' + + for dataset_name, min_df in [(big_dataset_name, 0), (small_dataset_name, 2)]: + pipeline = BaselineRecipe() + pipeline.format_recipe( + dataset_path='./tests/test_data/test_dataset.csv', + dictionary_filter_parameters={'min_df': min_df}, + ) + _, dataset = pipeline.build_experiment_environment( + experiment_id=dataset_name, + save_path='tests/experiments', + ) + datasets[dataset_name] = dataset + + big_dictionary = datasets[big_dataset_name].get_dictionary() + small_dictionary = datasets[small_dataset_name].get_dictionary() + big_num_entries = Dataset._get_dictionary_num_entries(big_dictionary) + small_num_entries = Dataset._get_dictionary_num_entries(small_dictionary) + + assert big_num_entries > small_num_entries diff --git a/topicnet/tests/test_topic_model.py b/topicnet/tests/test_topic_model.py index b99d07b..65659ae 100644 --- a/topicnet/tests/test_topic_model.py +++ b/topicnet/tests/test_topic_model.py @@ -1,6 +1,10 @@ +import datetime import pytest import warnings import shutil +import time + + import artm from ..cooking_machine.models.dummy_topic_model import DummyTopicModel @@ -9,6 +13,7 @@ from ..cooking_machine.dataset import Dataset, W_DIFF_BATCHES_1 from ..cooking_machine.models.example_score import ScoreExample from ..cooking_machine.models.blei_lafferty_score import BleiLaffertyScore +from ..cooking_machine.models import BaseScore ARTM_NINE = artm.version().split(".")[1] == "9" MAIN_MODALITY = "@text" @@ -230,3 +235,144 @@ def test_to_dummy_and_back_with_scores(experiment_enviroment): assert len(restored_topic_model.scores[custom_score_name]) == num_iterations assert len(restored_topic_model.scores[artm_score_name]) == num_iterations + + +@pytest.mark.parametrize( + 'should_compute', + [False, True, None, lambda i: False, lambda i: True, lambda i: i == 2] +) 
+def test_should_compute(experiment_enviroment, should_compute): + topic_model, dataset, experiment, dictionary = experiment_enviroment + + score_name = 'blei' + topic_model.scores.add( + BleiLaffertyScore( + name=score_name, + should_compute=should_compute, + ) + ) + score_should_compute = topic_model.custom_scores[score_name]._should_compute + + num_iters = 20 + last_iter = num_iters - 1 + + score_num_iters = sum( + 1 * [score_should_compute(i) for i in range(num_iters)] + ) + + if not score_should_compute(last_iter): + score_num_iters = score_num_iters + 1 + + topic_model._fit(dataset.get_batch_vectorizer(), num_iterations=num_iters) + model_scores = topic_model.scores + + assert len(model_scores[score_name]) == score_num_iters + + +def test_compute_on_custom_iterations(experiment_enviroment): + topic_model, dataset, experiment, dictionary = experiment_enviroment + + score_name_a = 'blei' + score_should_compute_a = lambda iter: iter == 5 # noqa E731 + topic_model.scores.add( + BleiLaffertyScore( + name=score_name_a, + should_compute=score_should_compute_a, + ) + ) + + score_name_b = 'perp' + score_should_compute_b = lambda iter: iter % 2 == 0 # noqa E731 + topic_model.scores.add( + BleiLaffertyScore( + name=score_name_b, + should_compute=score_should_compute_b, + ) + ) + + num_iters = 20 + last_iter = num_iters - 1 + + score_num_iters_a = sum( + 1 * [score_should_compute_a(i) for i in range(num_iters)] + ) + score_num_iters_b = sum( + 1 * [score_should_compute_b(i) for i in range(num_iters)] + ) + + if not score_should_compute_a(last_iter): + score_num_iters_a = score_num_iters_a + 1 + if not score_should_compute_b(last_iter): + score_num_iters_b = score_num_iters_b + 1 + + topic_model._fit(dataset.get_batch_vectorizer(), num_iterations=num_iters) + model_scores = topic_model.scores + + assert len(model_scores[score_name_a]) == score_num_iters_a + assert len(model_scores[score_name_b]) == score_num_iters_b + + +def test_precomputed(experiment_enviroment): + 
topic_model, dataset, experiment, dictionary = experiment_enviroment + + class SlowScore(BaseScore): + def __init__(self, name): + super().__init__(name=name) + + self._data_key = 'some_precomputed_data_key' + + def call(self, model: TopicModel, **kwargs): + precomputed_data = kwargs.get( + BaseScore._PRECOMPUTED_DATA_PARAMETER_NAME, dict() + ) + + if self._data_key in precomputed_data: + return 0 + + time.sleep(1) + + precomputed_data[self._data_key] = 0 + + return 1 + + topic_model.scores.add(SlowScore(name='slow_score')) + + num_iters = 5 + start_time = datetime.datetime.now() + + topic_model._fit(dataset.get_batch_vectorizer(), num_iterations=num_iters) + + middle_time = datetime.datetime.now() + time_for_one_score = (middle_time - start_time).total_seconds() + + topic_model.scores.add(SlowScore(name='score_b')) + topic_model._fit(dataset.get_batch_vectorizer(), num_iterations=num_iters) + + end_time = datetime.datetime.now() + time_for_two_scores = (end_time - middle_time).total_seconds() + + assert time_for_two_scores / time_for_one_score < 1.1 + + +def test_score_with_no_precomputed_for_compatibility(experiment_enviroment): + topic_model, dataset, experiment, dictionary = experiment_enviroment + + class ScoreWithNoKwargs(BaseScore): + return_value = 3 + + def __init__(self, name): + super().__init__(name=name) + + def call(self, model: TopicModel): + time.sleep(0.01) + + return self.return_value + + score_name = 'score_without_precomputed' + topic_model.scores.add(ScoreWithNoKwargs(name=score_name)) + + num_iters = 5 + topic_model._fit(dataset.get_batch_vectorizer(), num_iterations=num_iters) + + assert len(topic_model.scores[score_name]) == num_iters + assert all(v == ScoreWithNoKwargs.return_value for v in topic_model.scores[score_name]) From e41cfed905df3f1a8f4b4d68e5a795acb09fb23f Mon Sep 17 00:00:00 2001 From: Vasiliy Alekseev Date: Tue, 7 Jul 2020 00:43:44 +0300 Subject: [PATCH 3/9] change travis file: py v, run tessts command --- .travis.yml | 4 ++-- 
1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8b9f63f..61faee8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,7 +5,7 @@ cache: - $HOME/.ccache python: - - "3.6" + - "3.7" env: global: @@ -31,7 +31,7 @@ install: script: - cd topicnet - - py.test . --timeout=60 --cov + - py.test tests --timeout=45 --reruns 3 --reruns-delay 3 --cov after_success: - codecov From de3d87dc32673fee61d3c51b19f89b07118985c3 Mon Sep 17 00:00:00 2001 From: Vasiliy Alekseev Date: Tue, 7 Jul 2020 00:45:05 +0300 Subject: [PATCH 4/9] add new reqs --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 10744c4..0b0e970 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,12 +6,14 @@ dask[dataframe] dill ipython jinja2 +numba numexpr numpy pandas plotly pytest pytest-cov +pytest-rerunfailures pytest-timeout scikit-learn scipy From 96099309fbcb303e95a0584cbd9bad36bcd2ec0f Mon Sep 17 00:00:00 2001 From: Vasiliy Alekseev Date: Tue, 7 Jul 2020 00:47:22 +0300 Subject: [PATCH 5/9] modify setup: reqs, version --- setup.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/setup.py b/setup.py index 03087dc..9207168 100644 --- a/setup.py +++ b/setup.py @@ -2,8 +2,8 @@ setup( - name = 'topicnet', - packages = [ + name='topicnet', + packages=[ 'topicnet', 'topicnet.cooking_machine', 'topicnet.cooking_machine.cubes', @@ -12,14 +12,14 @@ 'topicnet.dataset_manager', 'topicnet.viewers' ], - version = '0.7.1', + version='0.8.0', license='MIT', - description = 'TopicNet is a module for topic modelling using ARTM algorithm', - author = 'Machine Intelligence Laboratory', - author_email = 'alex.goncharov@phystech.edu', - url = 'https://github.com/machine-intelligence-laboratory/TopicNet', - download_url = 'https://github.com/machine-intelligence-laboratory/TopicNet/archive/v0.7.1.tar.gz', - keywords = [ + description='TopicNet is a module for topic modelling using 
ARTM algorithm', + author='Machine Intelligence Laboratory', + author_email='alex.goncharov@phystech.edu', + url='https://github.com/machine-intelligence-laboratory/TopicNet', + download_url='https://github.com/machine-intelligence-laboratory/TopicNet/archive/v0.8.0.tar.gz', + keywords=[ 'ARTM', 'topic modeling', 'regularization', @@ -33,7 +33,7 @@ 'dill', 'ipython', 'jinja2', - 'numba', + 'numba', 'numexpr', 'numpy', 'pandas', @@ -52,6 +52,6 @@ 'Topic :: Scientific/Engineering :: Artificial Intelligence', 'Topic :: Scientific/Engineering :: Information Analysis', 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', ], ) From 91fef51225d00deb797c545c6031612ab54e02bd Mon Sep 17 00:00:00 2001 From: Vasiliy Alekseev Date: Tue, 7 Jul 2020 00:48:14 +0300 Subject: [PATCH 6/9] add a couple of commas in setup --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 9207168..99c510d 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ 'topicnet.cooking_machine.models', 'topicnet.cooking_machine.recipes', 'topicnet.dataset_manager', - 'topicnet.viewers' + 'topicnet.viewers', ], version='0.8.0', license='MIT', @@ -24,7 +24,7 @@ 'topic modeling', 'regularization', 'multimodal learning', - 'document vector representation' + 'document vector representation', ], install_requires=[ 'bigartm', From 48b1c7ab132eea024fe02892256763eb3e264b7e Mon Sep 17 00:00:00 2001 From: Vasiliy Alekseev Date: Tue, 7 Jul 2020 01:06:20 +0300 Subject: [PATCH 7/9] add kostyl-run tests command in travis to speed up --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 61faee8..8a82598 100644 --- a/.travis.yml +++ b/.travis.yml @@ -31,8 +31,8 @@ install: script: - cd topicnet - - py.test tests --timeout=45 --reruns 3 --reruns-delay 3 --cov + - py.test tests -k "not tests/test_pipeline.py" --timeout=45 --reruns 3 
--reruns-delay 3 --cov + # TODO: - py.test tests --timeout=45 --reruns 3 --reruns-delay 3 --cov after_success: - codecov - From 147c3ab983a6672f45f93d76d294b627ac42f8ff Mon Sep 17 00:00:00 2001 From: Vasiliy Alekseev Date: Tue, 7 Jul 2020 01:25:02 +0300 Subject: [PATCH 8/9] add section about contribution --- README.md | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 7b8c06a..b194ee2 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -

      TopicNet

      +

      TopicNet

      @@ -31,9 +31,9 @@ It aims at automating model training routine freeing more time for artistic proc Consider using TopicNet if: -* you want to explore BigARTM functionality without writing an overhead. -* you need help with rapid solution prototyping. -* you want to build a good topic model quickly (out-of-box, with default parameters). +* you want to explore BigARTM functionality without writing an overhead; +* you need help with rapid solution prototyping; +* you want to build a good topic model quickly (out-of-box, with default parameters); * you have an ARTM model at hand and you want to explore it's topics. `TopicNet` provides an infrastructure for your prototyping with the help of `Experiment` class and helps to observe results of your actions via [`viewers`](topicnet/viewers) module. @@ -351,7 +351,20 @@ parameters:[ ] ``` +# Contribution + +If you find a bug, or if you would like the library to have some new features — you are welcome to contact us or create an issue or a pull request! + +It is also worth noting that TopicNet library is always open to improvements in several areas: + +* New custom regularizers. +* New topic model scores. +* New topic models or recipes to train topic models for a particular task/with some special properties. +* New datasets (so as to make them available for everyone to download and conduct experiments with topic models). 
+ + # Citing TopicNet + When citing `topicnet` in academic papers and theses, please use this BibTeX entry: ``` From 1d005304b4362a856eb20ebb515efed7eb47840c Mon Sep 17 00:00:00 2001 From: Vasiliy Alekseev Date: Tue, 7 Jul 2020 02:05:28 +0300 Subject: [PATCH 9/9] [ci skip] fix link in demos readme --- topicnet/demos/README.md | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/topicnet/demos/README.md b/topicnet/demos/README.md index a33f51a..384785f 100644 --- a/topicnet/demos/README.md +++ b/topicnet/demos/README.md @@ -1,33 +1,37 @@ -# Demo +# Demos + This section provides demonstrations of how to use this library in NLP tasks. -1. [RTL-Wiki-Preprocessing](RTL-Wiki-Preprocessing.ipynb) -- notebook working with a dataset introduced in [1]. It serves as an example of a typical preprocessing pipeline: getting a dataset, lemmatizing it, extracting n-grams/collocations, writing data in VW format +1. [RTL-Wiki-Preprocessing](RTL-Wiki-Preprocessing.ipynb) — notebook working with a dataset introduced in [1]. It serves as an example of a typical preprocessing pipeline: getting a dataset, lemmatizing it, extracting n-grams/collocations, writing data in VW format + +2. [RTL-Wiki-Building-Topic-Mode](RTL-Wiki-Building-Topic-Model.ipynb) — notebook with first steps to build topic model by consequently tuning its hyperparameters -2. [RTL-Wiki-Building-Topic-Mode](RTL-Wiki-Building-Topic-Model.ipynb) -- notebook with first steps to build topic model by consequently tuning its hyperparameters +3. [Visualizing-Your-Model-Documents](Visualizing-Your-Model-Documents.ipynb) — notebook providing a fresh outlook on unstructured document collection with the help of a topic model -3. [Visualizing-Your-Model-Documents](Visualizing-Your-Model-Documents.ipynb) -- notebook providing a fresh outlook on unstructured document collection with the help of a topic model +4. 
[20NG-Preprocessing](20NG-Preprocessing.ipynb) — preparing data from a well-known 20 Newsgroups dataset -4. [20NG-Preprocessing](20NG-Preprocessing.ipynb) -- preparing data from a well-know 20 Newsgroups dataset +5. [20NG-GenSim-vs-TopicNet](20NG-GenSim-vs-TopicNet.ipynb) — a comparison between two topic models built by Gensim and TopicNet library. In the notebook we compare model topics by calculating their UMass coherence measure with the help of [Palmetto](https://palmetto.demos.dice-research.org/) and using Jaccard measure to compare topic top-tokens diversity -5. [20NG-GenSim-vs-TopicNet](20NG-GenSim-vs-TopicNet.ipynb) -- a comparison between two topic models build by Gensim and TopicNet library. In the notebook we compare model topics by calculating their UMass coherence measure with the help of [Palmetto](https://palmetto.demos.dice-research.org/) and using Jaccard measure to compare topic top-tokens diversity +6. [PostNauka-Building-Topic-Model](PostNauka-Building-Topic-Model.ipynb) — an analog of the RTL-Wiki notebook performed on the corpus of Russian pop-science articles given by postnauka.ru -6. [PostNauka-Building-Topic-Model](PostNauka-Building-Topic-Model.ipynb)-- an analog of the RTL-Wiki notebook performed on the corpus of Russian pop-science articles given by postnauka.ru +7. [PostNauka-Recipe](PostNauka-Recipe.ipynb) — a demonstration of rapid-prototyping methods provided by the library -7. [PostNauka-Recipe](PostNauka-Recipe.ipynb) -- a demonstration of rapid-prototyping methods provided by the library +8. [Coherence-Maximization-Recipe](Coherence-Maximization-Recipe.ipynb) — a recipe for hyperparameter search in regard to custom Coherence metric -8. [Coherence-Maximization-Recipe](Coherence-Maximization-Recipe.ipynb) -- a recipe for hyperparameter search in regard to custom Coherence metric +9. 
[Topic-Prior-Regularizer-Tutorial](Topic-Prior-Regularizer-Tutorial.ipynb) — a demonstration of the approach to learning topics from the unbalanced corpus -9. [Topic-Prior-Regularizer-Tutorial](Topic-Prior-Regularizer-Tutorial.ipynb) -- a demonstration of the approach to learning topics from the unbalanced corpus +10. [Making-Decorrelation-and-Topic-Selection-Friends](Making-Decorrelation-and-Topic-Selection-Friends.ipynb) — reproduction of a very complicated experiment on automatically learning the optimal number of topics from the collection. The hurdle is that both needed regularizers, when working together, nullify the token-topic matrix. -10. [Making-Decorrelation-and-Topic-Selection-Friends](Making-Decorrelation-and-Topic-Selection-Friends.ipynb) -- reproduction of a very complicated experiment on automatically learning optimal number of topics from the collection. Hurdle is -- both needed regularizers when working together nullify token-topic matrix. +11. [Topic-Thetaless-Regularizer](Topic-Thetaless-Regularizer.ipynb) — the Additive Regularization of Topic Models (ARTM) formalism views the topic modeling task as an optimization problem and leverages various computational and optimization tricks to make it converge faster and more predictably. This is an example of such optimization (let's reduce the number of inferred parameters), interpreted as a powerful custom regularizer. -11. [Topic-Thetaless-Regularizer](topic_thetaless_regularizer.ipynb) -- The Additive Regularization of Topic Models (ARTM) formalism views the topic modeling task as an optimization problem and leverages various computational and optimization tricks to make it converge faster and more predictably. This is an example of such optimization (let's reduce the number of inferred parameters), interpreted as a powerful custom regularizer. +## References ----- [1](https://dl.acm.org/doi/10.5555/2984093.2984126) Jonathan Chang, Jordan Boyd-Graber, Sean Gerrish, Chong Wang, and David M. Blei. 2009. 
Reading tea leaves: how humans interpret topic models. In Proceedings of the 22nd International Conference on Neural Information Processing Systems (NIPS’09). Curran Associates Inc., Red Hook, NY, USA, 288–296. ----- -P.S. All the guides are supposed to contain **working** examples of the library code. + +## P.S. + +All the guides are supposed to contain **working** examples of the library code. If you happen to find code that is no longer works, please write about it in the library issues. We will try to resolve it as soon as possible and plan to include fixes in the nearest releases.