
Treat 1-point datasets equally in sequential and parallel fits #2276

Merged: 11 commits, Feb 20, 2025
10 changes: 4 additions & 6 deletions doc/sphinx/source/n3fit/runcard_detailed.rst
@@ -318,10 +318,8 @@ flag in the runcard to ``true`` when running a range of replicas.
Running in parallel can be quite hard on memory and it is only advantageous when
fitting on a GPU, where one can find a speed up equal to the number of models run
in parallel (each model being a different replica).

When running in parallel it might be advantageous (e.g., for debugging)
to set the training-validation split to be equal for all replicas;
this can be done with the `same_trvl_per_replica: true` runcard flag.
Running models in parallel produces exactly the same pseudodata as the sequential runs.
Note that numerical differences might still be generated during the training.

In other words, in order to run several replicas in parallel on a machine
(be it a big CPU or, most likely, a GPU)
@@ -332,8 +330,8 @@ top-level option:

parallel_models: true

Note that currently, in order to run with parallel models, one has to set ``savepseudodata: false``
in the ``fitting`` section of the runcard. Once this is done, the user can run ``n3fit`` with a

Once this is done, the user can run ``n3fit`` with a
replica range to be parallelized (in this case from replica 1 to replica 4).

.. code-block:: bash
41 changes: 27 additions & 14 deletions n3fit/src/n3fit/model_gen.py
@@ -1,11 +1,11 @@
"""
Library of functions which generate the NN objects
Library of functions which generate the NN objects

Contains:
# observable_generator:
Generates the output layers as functions
# pdfNN_layer_generator:
Generates the PDF NN layer to be fitted
Contains:
# observable_generator:
Generates the output layers as functions
# pdfNN_layer_generator:
Generates the PDF NN layer to be fitted


"""
@@ -26,7 +26,6 @@
base_layer_selector,
)
from n3fit.backends import operations as op
from n3fit.backends import regularizer_selector
from n3fit.layers import (
DIS,
DY,
@@ -42,6 +41,8 @@
from n3fit.msr import generate_msr_model_and_grid
from validphys.photon.compute import Photon

from n3fit.backends import regularizer_selector # isort: skip isort and black don't agree


@dataclass
class ObservableWrapper:
@@ -127,7 +128,8 @@ def __call__(self, pdf_layer, mask=None):
def observable_generator(
spec_dict,
boundary_condition=None,
mask_array=None,
training_mask_array=None,
validation_mask_array=None,
training_data=None,
validation_data=None,
invcovmat_tr=None,
@@ -142,7 +144,6 @@
the result of the observable for each contained dataset (n_points,).

In summary the model has the following structure:
One experiment layer, made of any number of observable layers.
Observable layers, corresponding to commondata datasets
and made of any number of fktables (and an operation on them).

@@ -170,6 +171,12 @@
boundary_condition: dict
dictionary containing the instance of a PDF set to be used as a
Boundary Condition.
training_mask_array: np.ndarray
training mask per replica
validation_mask_array: np.ndarray
validation mask per replica; when not given, ~training_mask_array will be used.
While in general the validation mask is the negation of the training mask, in special cases
such as 1-point datasets the points are accepted by both masks and then removed by the loss.
n_replicas: int
number of replicas fitted simultaneously
positivity_initial: float
@@ -245,12 +252,18 @@
model_inputs = np.concatenate(model_inputs).reshape(1, -1)

# Make the mask layers...
if mask_array is not None:
tr_mask_layer = Mask(mask_array, name=f"trmask_{spec_name}")
vl_mask_layer = Mask(~mask_array, name=f"vlmask_{spec_name}")
else:
if training_mask_array is None:
tr_mask_layer = None
vl_mask_layer = None
if validation_mask_array is None:
vl_mask_layer = None
else:
vl_mask_layer = Mask(validation_mask_array, name=f"vlmask_{spec_name}")
else:
tr_mask_layer = Mask(training_mask_array, name=f"trmask_{spec_name}")
if validation_mask_array is None:
vl_mask_layer = Mask(~training_mask_array, name=f"vlmask_{spec_name}")
else:
vl_mask_layer = Mask(validation_mask_array, name=f"vlmask_{spec_name}")

# Make rotations of the final data (if any)
if spec_dict.get("data_transformation") is not None:
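The two-mask interface introduced above can be summarized with a small standalone sketch (a hypothetical helper, not the PR's code), assuming boolean numpy arrays as per-replica masks: when only a training mask is supplied, the validation mask defaults to its negation; for 1-point datasets the same point may be accepted by both masks and is then dropped by the loss.

```python
import numpy as np

def select_masks(training_mask=None, validation_mask=None):
    """Hypothetical illustration of the training/validation mask rule.

    Returns the (training, validation) pair of boolean masks, defaulting the
    validation mask to the negation of the training mask when it is not given.
    """
    if training_mask is None:
        # No training mask: nothing to negate, keep whatever validation mask was given
        return None, validation_mask
    if validation_mask is None:
        validation_mask = ~training_mask
    return training_mask, validation_mask

# Regular dataset: validation is the negation of training
tr, vl = select_masks(np.array([True, False, True]))
print(tr, vl)  # [ True False  True] [False  True False]

# 1-point dataset: both masks can accept the single point;
# the duplicated contribution is then removed by the loss
tr1, vl1 = select_masks(np.array([True]), np.array([True]))
print(tr1, vl1)  # [ True] [ True]
```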
78 changes: 54 additions & 24 deletions n3fit/src/n3fit/model_trainer.py
@@ -1,26 +1,26 @@
"""
The ModelTrainer class is the true driver around the n3fit code
The ModelTrainer class is the true driver around the n3fit code

This class is initialized with all information about the NN, inputs and outputs.
The construction of the NN and the fitting is performed at the same time when the
hyperparametrizable method of the function is called.
This class is initialized with all information about the NN, inputs and outputs.
The construction of the NN and the fitting is performed at the same time when the
hyperparametrizable method of the function is called.

This allows to use hyperscanning libraries, that need to change the parameters of the network
between iterations while at the same time keeping the amount of redundant calls to a minimum
This allows to use hyperscanning libraries, that need to change the parameters of the network
between iterations while at the same time keeping the amount of redundant calls to a minimum
"""

import logging
from collections import namedtuple
from itertools import zip_longest
import logging

import numpy as np

import n3fit.hyper_optimization.penalties
import n3fit.hyper_optimization.rewards
from n3fit import model_gen
from n3fit.backends import NN_LAYER_ALL_REPLICAS, Lambda, MetaModel, callbacks, clear_backend_state
from n3fit.backends import operations as op
from n3fit.hyper_optimization.hyper_scan import HYPEROPT_STATUSES
import n3fit.hyper_optimization.penalties
import n3fit.hyper_optimization.rewards
from n3fit.hyper_optimization.rewards import HyperLoss
from n3fit.scaler import generate_scaler
from n3fit.stopping import Stopping
@@ -151,7 +151,6 @@ def __init__(
self.exp_info = list(exp_info)
self.pos_info = [] if pos_info is None else pos_info
self.integ_info = [] if integ_info is None else integ_info
self.all_info = self.exp_info[0] + self.pos_info + self.integ_info
self.boundary_condition = boundary_condition
self.flavinfo = flavinfo
self.fitbasis = fitbasis
@@ -528,9 +527,12 @@ def _generate_observables(
self._reset_observables()
log.info("Generating layers")

# We need to transpose Experimental data, stacking over replicas
# validphys has generated the self.exp_info information replica-by-replica
# Here we transpose all information for convenience so that the loop over observables
# and the vectorization over replicas is made explicit
experiment_data = {
"trmask": [],
"vlmask": [],
"expdata": [],
"expdata_vl": [],
"invcovmat": [],
@@ -561,7 +563,8 @@
exp_layer = model_gen.observable_generator(
exp_dict,
self.boundary_condition,
mask_array=experiment_data["trmask"][i],
training_mask_array=experiment_data["trmask"][i],
validation_mask_array=experiment_data["vlmask"][i],
training_data=experiment_data["expdata"][i],
validation_data=experiment_data["expdata_vl"][i],
invcovmat_tr=experiment_data["invcovmat"][i],
@@ -596,7 +599,7 @@
pos_dict,
self.boundary_condition,
positivity_initial=pos_initial,
mask_array=replica_masks,
training_mask_array=replica_masks,
training_data=training_data,
validation_data=training_data,
n_replicas=len(self.replicas),
@@ -712,20 +715,47 @@ def _prepare_reporting(self, partition):
to select the bits necessary for reporting the chi2.
Receives the chi2 partition data to see whether any dataset is to be left out
"""
reported_keys = ["name", "count_chi2", "positivity", "integrability", "ndata", "ndata_vl"]
reported_keys = ["name", "count_chi2", "positivity", "integrability"]
reporting_list = []
for exp_dict in self.all_info:

# Most of the information is shared among replicas, only ndata/ndata_vl
# might change replica to replica and they need to be filled with care
for idx, exp_dict in enumerate(self.exp_info[0]):
# Fill in the keys that are equal across replicas
reporting_dict = {k: exp_dict.get(k) for k in reported_keys}

# Now loop over replicas to fill in all data points as a list
list_ndata = []
list_ndata_vl = []
for replica in self.exp_info:
replica_exp_dict = replica[idx]

ndata = replica_exp_dict.get("ndata")
ndata_vl = replica_exp_dict.get("ndata_vl")

if partition:
# If we are in a k-fold partition, we need to remove the folded data
# from both the training and validation to avoid calculating the chi2 wrong
for dataset in replica_exp_dict["datasets"]:
if dataset in partition["datasets"]:
dataset_ndata = dataset["ndata"]
frac = dataset["frac"]
ndata -= int(dataset_ndata * frac)
ndata_vl -= int(dataset_ndata * (1 - frac))

list_ndata.append(ndata)
list_ndata_vl.append(ndata_vl)

reporting_dict["ndata"] = list_ndata
reporting_dict["ndata_vl"] = list_ndata_vl
reporting_list.append(reporting_dict)

for exp_dict in self.pos_info + self.integ_info:
reporting_dict = {k: exp_dict.get(k) for k in reported_keys}
if partition:
# If we are in a partition we need to remove the number of datapoints
# in order to avoid calculating the chi2 wrong
for dataset in exp_dict["datasets"]:
if dataset in partition["datasets"]:
ndata = dataset["ndata"]
frac = dataset["frac"]
reporting_dict["ndata"] -= int(ndata * frac)
reporting_dict["ndata_vl"] = int(ndata * (1 - frac))
reporting_dict["ndata"] = [exp_dict.get("ndata")]
reporting_dict["ndata_vl"] = [exp_dict.get("ndata_vl")]
reporting_list.append(reporting_dict)

return reporting_list

def _train_and_fit(self, training_model, stopping_object, epochs=100) -> bool:
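To make the reporting change above concrete, here is a toy example (made-up numbers, not the actual n3fit data structures) of how `ndata`/`ndata_vl` become per-replica lists when the training/validation split differs between replicas:

```python
# Toy illustration: two replicas with different training/validation splits
# for the same experiment; the report collects one entry per replica.
exp_info = [
    [{"name": "EXP_A", "ndata": 10, "ndata_vl": 2}],  # replica 1
    [{"name": "EXP_A", "ndata": 9, "ndata_vl": 3}],   # replica 2
]

reporting_dict = {"name": exp_info[0][0]["name"]}
reporting_dict["ndata"] = [replica[0]["ndata"] for replica in exp_info]
reporting_dict["ndata_vl"] = [replica[0]["ndata_vl"] for replica in exp_info]

print(reporting_dict)
# {'name': 'EXP_A', 'ndata': [10, 9], 'ndata_vl': [2, 3]}
```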
26 changes: 16 additions & 10 deletions n3fit/src/n3fit/scripts/n3fit_exec.py
@@ -41,6 +41,11 @@
TAB_FOLDER = "tables"


# Suppress the arguments that we don't want the help of n3fit to show
# note that these would still be parsed by vp/reportengine
SUPPRESS = ["parallel", "no-parallel", "scheduler", "style", "format"]


class N3FitError(Exception):
"""Exception raised when n3fit cannot succeed and knows why"""

@@ -127,17 +132,14 @@ def from_yaml(cls, o, *args, **kwargs):
if fps := file_content["fitting"].get("savepseudodata", True):
if fps != True:
raise TypeError(f"fitting::savepseudodata is neither True nor False ({fps})")
if len(kwargs["environment"].replicas) != 1:
raise ConfigError(
"Cannot request that multiple replicas are fitted and that "
"pseudodata is saved. Either set `fitting::savepseudodata` "
"to `false` or fit replicas one at a time."
)
# take same namespace configuration on the pseudodata_table action.
training_action = namespace + "training_pseudodata"
validation_action = namespace + "validation_pseudodata"

N3FIT_FIXED_CONFIG['actions_'].extend((training_action, validation_action))
training_action = namespace + "replicas_training_pseudodata"
validation_action = namespace + "replicas_validation_pseudodata"
all_data_action = namespace + "replicas_pseudodata"

N3FIT_FIXED_CONFIG['actions_'].extend(
(training_action, validation_action, all_data_action)
)

if thconfig := file_content.get('fiatlux'):
N3FIT_FIXED_CONFIG['fiatlux'] = thconfig
@@ -245,6 +247,10 @@ def __init__(self):
@property
def argparser(self):
parser = super().argparser

for argo in SUPPRESS:
parser.add_argument(f"--{argo}", help=argparse.SUPPRESS)

parser.add_argument(
"-o", "--output", help="Output folder and name of the fit", default=None
)
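The argument-suppression pattern used above relies on `argparse.SUPPRESS` as the help string, which hides an option from `--help` while still parsing it. A minimal standalone sketch (example flags only, not the real n3fit parser):

```python
import argparse

HIDDEN = ["parallel", "scheduler"]  # example flags to hide from the help output

parser = argparse.ArgumentParser(prog="example-fit")
for flag in HIDDEN:
    # help=argparse.SUPPRESS keeps the option out of --help,
    # but it is still accepted (and parsed) on the command line
    parser.add_argument(f"--{flag}", help=argparse.SUPPRESS)
parser.add_argument("-o", "--output", help="Output folder and name of the fit", default=None)

args = parser.parse_args(["--scheduler", "dask", "-o", "myfit"])
print(args.scheduler, args.output)  # dask myfit
```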