From b6d8b40f8aeac51abaf8aa616ca6f0862172cc95 Mon Sep 17 00:00:00 2001 From: juacrumar Date: Wed, 19 Feb 2025 07:58:31 +0100 Subject: [PATCH] save pseudodata for each replica in the right folder ; add complicated reportengine wrapper ; remove complicated validphys function --- n3fit/src/n3fit/scripts/n3fit_exec.py | 13 ++- n3fit/src/n3fit/stopping.py | 4 +- validphys2/src/validphys/n3fit_data.py | 111 +++++++++++++++++-------- 3 files changed, 84 insertions(+), 44 deletions(-) diff --git a/n3fit/src/n3fit/scripts/n3fit_exec.py b/n3fit/src/n3fit/scripts/n3fit_exec.py index 32a8420c48..2175460071 100755 --- a/n3fit/src/n3fit/scripts/n3fit_exec.py +++ b/n3fit/src/n3fit/scripts/n3fit_exec.py @@ -127,17 +127,14 @@ def from_yaml(cls, o, *args, **kwargs): if fps := file_content["fitting"].get("savepseudodata", True): if fps != True: raise TypeError(f"fitting::savepseudodata is neither True nor False ({fps})") - #if len(kwargs["environment"].replicas) != 1: - #raise ConfigError( - # "Cannot request that multiple replicas are fitted and that " - # "pseudodata is saved. Either set `fitting::savepseudodata` " - # "to `false` or fit replicas one at a time." - #) - # take same namespace configuration on the pseudodata_table action. 
+ training_action = namespace + "replicas_training_pseudodata" validation_action = namespace + "replicas_validation_pseudodata" + all_data_action = namespace + "replicas_pseudodata" - N3FIT_FIXED_CONFIG['actions_'].extend((training_action, validation_action)) + N3FIT_FIXED_CONFIG['actions_'].extend( + (training_action, validation_action, all_data_action) + ) if thconfig := file_content.get('fiatlux'): N3FIT_FIXED_CONFIG['fiatlux'] = thconfig diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py index 55efb9d78b..99be8f45e7 100644 --- a/n3fit/src/n3fit/stopping.py +++ b/n3fit/src/n3fit/stopping.py @@ -69,9 +69,9 @@ def parse_ndata(all_data): if dictionary.get("count_chi2"): tr_ndata = dictionary["ndata"] vl_ndata = dictionary["ndata_vl"] - if tr_ndata: + if sum(tr_ndata) != 0: tr_ndata_dict[exp_name] = np.array(tr_ndata) - if vl_ndata: + if sum(vl_ndata) != 0: vl_ndata_dict[exp_name] = np.array(vl_ndata) if dictionary.get("positivity") and not dictionary.get("integrability"): pos_set.append(exp_name) diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py index daadc26e36..29c79a1277 100644 --- a/validphys2/src/validphys/n3fit_data.py +++ b/validphys2/src/validphys/n3fit_data.py @@ -5,7 +5,8 @@ :py:func:`n3fit.performfit.performfit`. """ -from collections import defaultdict +from collections import abc, defaultdict +from copy import copy import functools import hashlib import logging @@ -13,7 +14,7 @@ import numpy as np import pandas as pd -from reportengine import collect +from reportengine import collect, namespaces from reportengine.table import table from validphys.core import IntegrabilitySetSpec, TupleComp from validphys.n3fit_data_utils import validphys_group_extractor @@ -21,6 +22,49 @@ log = logging.getLogger(__name__) +def _per_replica(f): + """Decorator to be used on top of reportengine's decorator. 
+ It replaces the preparation step of the decorator with a custom function, + which modifies the output behaviour when there is a collection of replicas. + + If there is no ``replica_path`` in the environment or no collection over replicas + this function does nothing. Otherwise, it removes the replica number from the + output file and directs the output to ``replica_<rnumber>`` instead. + """ + original_prepare = f.prepare + + def prepare_replica_path(*, spec, namespace, environment, **kwargs): + if not hasattr(environment, "replica_path") or "replicas" not in namespace: + return original_prepare(spec=spec, namespace=namespace, environment=environment) + + if not isinstance(namespace["replicas"], abc.Collection): + return original_prepare(spec=spec, namespace=namespace, environment=environment) + + # Now loop over the function input to get the replica collection, + # which we will then remove + rnumber = None + new_nsspec = [] + for farg in spec.nsspec: + if isinstance(farg, abc.Collection) and farg[0] == "replicas": + rnumber = namespaces.value_from_spcec_ele(namespace, farg) + else: + new_nsspec.append(farg) + if rnumber is None: + raise ValueError("Wrong call to @_replica_table") + + replica_path = environment.replica_path / f"replica_{rnumber}" + + new_env = copy(environment) + new_env.table_folder = replica_path + new_spec = spec._replace(nsspec=tuple(new_nsspec)) + + return original_prepare(spec=new_spec, namespace=namespace, environment=new_env) + + f.prepare = prepare_replica_path + + return f + + def replica_trvlseed(replica, trvlseed, same_trvl_per_replica=False): """Generates the ``trvlseed`` for a ``replica``.""" # TODO: move to the new infrastructure @@ -74,7 +118,7 @@ def __iter__(self): yield from self.masks -def tr_masks(data, replica_trvlseed, parallel_models=False, replica=1, replicas=(1,)): +def tr_masks(data, replica_trvlseed): """Generate the boolean masks used to split data into training and validation points. 
Returns a list of 1-D boolean arrays, one for each dataset. Each array has length equal to N_data, the datapoints which @@ -374,50 +418,48 @@ def replica_nnseed_fitting_data_dict(replica, exps_fitting_data_dict, replica_nn """ return (replica, exps_fitting_data_dict, replica_nnseed) + replicas_training_pseudodata = collect("training_pseudodata", ("replicas",)) replicas_validation_pseudodata = collect("validation_pseudodata", ("replicas",)) +replicas_pseudodata = collect("pseudodata_table", ("replicas",)) replicas_nnseed_fitting_data_dict = collect("replica_nnseed_fitting_data_dict", ("replicas",)) groups_replicas_indexed_make_replica = collect( "indexed_make_replica", ("replicas", "group_dataset_inputs_by_experiment") ) +experiment_indexed_make_replica = collect( + "indexed_make_replica", ("group_dataset_inputs_by_experiment",) +) -@table -def pseudodata_table(groups_replicas_indexed_make_replica, replicas): - """Creates a pandas DataFrame containing the generated pseudodata. The - index is :py:func:`validphys.results.experiments_index` and the columns - are the replica numbers. +def replica_pseudodata(experiment_indexed_make_replica, replica): + """Creates a pandas DataFrame containing the generated pseudodata. + The index is :py:func:`validphys.results.experiments_index` and the single + column is named after the replica number. Notes ----- Whilst running ``n3fit``, this action will only be called if - `fitting::savepseudodata` is `true` (as per the default setting) and - replicas are fitted one at a time. The table can be found in the replica - folder i.e. /nnfit/replica_*/ + `fitting::savepseudodata` is `true` (as per the default setting). + The table can be found in the replica folder i.e. <fit dir>/nnfit/replica_*/ """ - # groups_replicas_indexed_make_replica is collected over both replicas and dataset_input groups, - # in that order. What this means is that groups_replicas_indexed_make_replica is a list of size - # number_of_replicas x number_of_data_groups. 
Where the ordering inside the list is as follows: - # [data1_rep1, data2_rep1, ..., datan_rep1, ..., data1_repn, data2_repn, ..., datan_repn]. - - # To correctly put this into a single dataframe, we first need to know the number of - # dataset_input groups there are for each replica - groups_per_replica = len(groups_replicas_indexed_make_replica) // len(replicas) - # then we make a list of pandas dataframes, each containing the pseudodata of all datasets - # generated for a single replica - df = [ - pd.concat(groups_replicas_indexed_make_replica[i : i + groups_per_replica]) - for i in range(0, len(groups_replicas_indexed_make_replica), groups_per_replica) - ] - # then we concatentate the pseudodata of all replicas into a single dataframe - df = pd.concat(df, axis=1) - # and finally we add as column titles the replica name - df.columns = [f"replica {rep}" for rep in replicas] + df = pd.concat(experiment_indexed_make_replica) + df.columns = [f"replica {replica}"] return df +@_per_replica +@table +def pseudodata_table(replica_pseudodata): + """Save the pseudodata for the given replica. + Deactivate by setting ``fitting::savepseudodata: False`` + from within the fit runcard. + """ + return replica_pseudodata + + +@_per_replica @table -def training_pseudodata(pseudodata_table, training_mask): +def training_pseudodata(replica_pseudodata, replica_training_mask): """Save the training data for the given replica. Deactivate by setting ``fitting::savepseudodata: False`` from within the fit runcard. @@ -426,20 +468,21 @@ def training_pseudodata(pseudodata_table, training_mask): -------- :py:func:`validphys.n3fit_data.validation_pseudodata` """ - return pseudodata_table.loc[training_mask.values] + return replica_pseudodata.loc[replica_training_mask.values] +@_per_replica @table -def validation_pseudodata(pseudodata_table, training_mask): +def validation_pseudodata(replica_pseudodata, replica_training_mask): """Save the training data for the given replica. 
Deactivate by setting ``fitting::savepseudodata: False`` from within the fit runcard. See Also -------- :py:func:`validphys.n3fit_data.training_pseudodata` """ - return pseudodata_table.loc[~training_mask.values] + return replica_pseudodata.loc[~replica_training_mask.values] exps_tr_masks = collect("tr_masks", ("group_dataset_inputs_by_experiment",))