Skip to content

Commit

Permalink
save pseudodata for each replica in the right folder ; add complicate…
Browse files Browse the repository at this point in the history
…d reportengine wrapper ; remove complicated validphys functio
  • Loading branch information
scarlehoff committed Feb 19, 2025
1 parent 1ac19a1 commit b6d8b40
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 44 deletions.
13 changes: 5 additions & 8 deletions n3fit/src/n3fit/scripts/n3fit_exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,17 +127,14 @@ def from_yaml(cls, o, *args, **kwargs):
if fps := file_content["fitting"].get("savepseudodata", True):
if fps != True:
raise TypeError(f"fitting::savepseudodata is neither True nor False ({fps})")
#if len(kwargs["environment"].replicas) != 1:
#raise ConfigError(
# "Cannot request that multiple replicas are fitted and that "
# "pseudodata is saved. Either set `fitting::savepseudodata` "
# "to `false` or fit replicas one at a time."
#)
# take same namespace configuration on the pseudodata_table action.

training_action = namespace + "replicas_training_pseudodata"
validation_action = namespace + "replicas_validation_pseudodata"
all_data_action = namespace + "replicas_pseudodata"

N3FIT_FIXED_CONFIG['actions_'].extend((training_action, validation_action))
N3FIT_FIXED_CONFIG['actions_'].extend(
(training_action, validation_action, all_data_action)
)

if thconfig := file_content.get('fiatlux'):
N3FIT_FIXED_CONFIG['fiatlux'] = thconfig
Expand Down
4 changes: 2 additions & 2 deletions n3fit/src/n3fit/stopping.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,9 @@ def parse_ndata(all_data):
if dictionary.get("count_chi2"):
tr_ndata = dictionary["ndata"]
vl_ndata = dictionary["ndata_vl"]
if tr_ndata:
if sum(tr_ndata) != 0:
tr_ndata_dict[exp_name] = np.array(tr_ndata)
if vl_ndata:
if sum(vl_ndata) != 0:
vl_ndata_dict[exp_name] = np.array(vl_ndata)
if dictionary.get("positivity") and not dictionary.get("integrability"):
pos_set.append(exp_name)
Expand Down
111 changes: 77 additions & 34 deletions validphys2/src/validphys/n3fit_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,66 @@
:py:func:`n3fit.performfit.performfit`.
"""

from collections import defaultdict
from collections import abc, defaultdict
from copy import copy
import functools
import hashlib
import logging

import numpy as np
import pandas as pd

from reportengine import collect
from reportengine import collect, namespaces
from reportengine.table import table
from validphys.core import IntegrabilitySetSpec, TupleComp
from validphys.n3fit_data_utils import validphys_group_extractor

log = logging.getLogger(__name__)


def _per_replica(f):
    """Redirect the output of a reportengine provider to a per-replica folder.

    Decorator to be used on top of reportengine's decorator (for instance
    ``@table``). It replaces the preparation step of the decorated object
    (``f.prepare``) with a custom function which modifies the output behaviour
    when there is a collection over replicas.

    If there is no ``replica_path`` in the environment, or ``replicas`` is not
    a collection in the namespace, the original ``prepare`` is called with
    unmodified arguments. Otherwise the ``replicas`` element is removed from
    the namespace specification (so that the replica number no longer appears
    in the output file name) and the output is directed to
    ``<environment.replica_path>/replica_<replica>`` instead.

    Parameters
    ----------
    f:
        object exposing a ``prepare`` attribute, which must accept the keyword
        arguments ``spec``, ``namespace`` and ``environment``.

    Returns
    -------
    The same object ``f``, with its ``prepare`` attribute replaced.

    Raises
    ------
    ValueError
        raised by the replaced ``prepare`` when a collection over replicas is
        active but no element of ``spec.nsspec`` refers to ``replicas``.
    """
    original_prepare = f.prepare

    # Extra keyword arguments are accepted but intentionally not forwarded:
    # ``original_prepare`` is always called with exactly these three keywords.
    def prepare_replica_path(*, spec, namespace, environment, **kwargs):
        per_replica_output = (
            hasattr(environment, "replica_path")
            and "replicas" in namespace
            and isinstance(namespace["replicas"], abc.Collection)
        )
        if not per_replica_output:
            return original_prepare(spec=spec, namespace=namespace, environment=environment)

        # Loop over the namespace specification to find the element that
        # selects the replica; it is used to read the replica number and
        # is then dropped from the specification.
        rnumber = None
        new_nsspec = []
        for farg in spec.nsspec:
            # The length check avoids an IndexError on an empty element
            if isinstance(farg, abc.Collection) and len(farg) > 0 and farg[0] == "replicas":
                rnumber = namespaces.value_from_spcec_ele(namespace, farg)
            else:
                new_nsspec.append(farg)
        if rnumber is None:
            raise ValueError("Wrong call to @_per_replica")

        replica_path = environment.replica_path / f"replica_{rnumber}"

        # Work on a shallow copy so that the override of ``table_folder``
        # is not written onto the environment object that was passed in.
        new_env = copy(environment)
        new_env.table_folder = replica_path
        new_spec = spec._replace(nsspec=tuple(new_nsspec))

        return original_prepare(spec=new_spec, namespace=namespace, environment=new_env)

    f.prepare = prepare_replica_path

    return f


def replica_trvlseed(replica, trvlseed, same_trvl_per_replica=False):
"""Generates the ``trvlseed`` for a ``replica``."""
# TODO: move to the new infrastructure
Expand Down Expand Up @@ -74,7 +118,7 @@ def __iter__(self):
yield from self.masks


def tr_masks(data, replica_trvlseed, parallel_models=False, replica=1, replicas=(1,)):
def tr_masks(data, replica_trvlseed):
"""Generate the boolean masks used to split data into training and
validation points. Returns a list of 1-D boolean arrays, one for each
dataset. Each array has length equal to N_data, the datapoints which
Expand Down Expand Up @@ -374,50 +418,48 @@ def replica_nnseed_fitting_data_dict(replica, exps_fitting_data_dict, replica_nn
"""
return (replica, exps_fitting_data_dict, replica_nnseed)


replicas_training_pseudodata = collect("training_pseudodata", ("replicas",))
replicas_validation_pseudodata = collect("validation_pseudodata", ("replicas",))
replicas_pseudodata = collect("pseudodata_table", ("replicas",))
replicas_nnseed_fitting_data_dict = collect("replica_nnseed_fitting_data_dict", ("replicas",))
groups_replicas_indexed_make_replica = collect(
"indexed_make_replica", ("replicas", "group_dataset_inputs_by_experiment")
)
experiment_indexed_make_replica = collect(
"indexed_make_replica", ("group_dataset_inputs_by_experiment",)
)


@table
def pseudodata_table(groups_replicas_indexed_make_replica, replicas):
"""Creates a pandas DataFrame containing the generated pseudodata. The
index is :py:func:`validphys.results.experiments_index` and the columns
are the replica numbers.
def replica_pseudodata(experiment_indexed_make_replica, replica):
"""Creates a pandas DataFrame containing the generated pseudodata.
The index is :py:func:`validphys.results.experiments_index` and the single
column is labelled with the replica number.
Notes
-----
Whilst running ``n3fit``, this action will only be called if
`fitting::savepseudodata` is `true` (as per the default setting) and
replicas are fitted one at a time. The table can be found in the replica
folder i.e. <fit dir>/nnfit/replica_*/
`fitting::savepseudodata` is `true` (as per the default setting).
The table can be found in the replica folder i.e. <fit dir>/nnfit/replica_*/
"""
# groups_replicas_indexed_make_replica is collected over both replicas and dataset_input groups,
# in that order. What this means is that groups_replicas_indexed_make_replica is a list of size
# number_of_replicas x number_of_data_groups. Where the ordering inside the list is as follows:
# [data1_rep1, data2_rep1, ..., datan_rep1, ..., data1_repn, data2_repn, ..., datan_repn].

# To correctly put this into a single dataframe, we first need to know the number of
# dataset_input groups there are for each replica
groups_per_replica = len(groups_replicas_indexed_make_replica) // len(replicas)
# then we make a list of pandas dataframes, each containing the pseudodata of all datasets
# generated for a single replica
df = [
pd.concat(groups_replicas_indexed_make_replica[i : i + groups_per_replica])
for i in range(0, len(groups_replicas_indexed_make_replica), groups_per_replica)
]
# then we concatentate the pseudodata of all replicas into a single dataframe
df = pd.concat(df, axis=1)
# and finally we add as column titles the replica name
df.columns = [f"replica {rep}" for rep in replicas]
df = pd.concat(experiment_indexed_make_replica)
df.columns = [f"replica {replica}"]
return df


@_per_replica
@table
def pseudodata_table(replica_pseudodata):
    """Save the pseudodata for the given replica.

    The function itself returns ``replica_pseudodata`` unchanged; writing it
    to disk is presumably handled by the ``@table`` decorator (to be confirmed
    against reportengine). ``@_per_replica`` swaps the decorated object's
    ``prepare`` step so that, when collecting over replicas, the output is
    directed to the ``replica_<replica>`` folder.

    Deactivate by setting ``fitting::savepseudodata: False``
    from within the fit runcard.
    """
    return replica_pseudodata


@_per_replica
@table
def training_pseudodata(pseudodata_table, training_mask):
def training_pseudodata(replica_pseudodata, replica_training_mask):
"""Save the training data for the given replica.
Deactivate by setting ``fitting::savepseudodata: False``
from within the fit runcard.
Expand All @@ -426,20 +468,21 @@ def training_pseudodata(pseudodata_table, training_mask):
--------
:py:func:`validphys.n3fit_data.validation_pseudodata`
"""
return pseudodata_table.loc[training_mask.values]
return replica_pseudodata.loc[replica_training_mask.values]


@_per_replica
@table
def validation_pseudodata(pseudodata_table, training_mask):
def validation_pseudodata(replica_pseudodata, replica_training_mask):
"""Save the validation data for the given replica.
Deactivate by setting ``fitting::savepseudodata: False``
from within the fit runcard.
See Also
--------
:py:func:`validphys.n3fit_data.training_pseudodata`
:py:func:`validphys.n3fit_data.validation_pseudodata`
"""
return pseudodata_table.loc[~training_mask.values]
return replica_pseudodata.loc[~replica_training_mask.values]


exps_tr_masks = collect("tr_masks", ("group_dataset_inputs_by_experiment",))
Expand Down

0 comments on commit b6d8b40

Please sign in to comment.