treat 1-point datasets equally in sequential and parallel fits
scarlehoff committed Feb 12, 2025
1 parent 99f4945 commit 77f259d
Showing 4 changed files with 91 additions and 68 deletions.
37 changes: 24 additions & 13 deletions n3fit/src/n3fit/model_gen.py
@@ -1,11 +1,11 @@
"""
Library of functions which generate the NN objects
Library of functions which generate the NN objects
Contains:
# observable_generator:
Generates the output layers as functions
# pdfNN_layer_generator:
Generates the PDF NN layer to be fitted
Contains:
# observable_generator:
Generates the output layers as functions
# pdfNN_layer_generator:
Generates the PDF NN layer to be fitted
"""
@@ -26,7 +26,7 @@
base_layer_selector,
)
from n3fit.backends import operations as op
-from n3fit.backends import regularizer_selector
+from n3fit.backends import regularizer_selector as reg_sec
from n3fit.layers import (
DIS,
DY,
@@ -128,6 +128,7 @@ def observable_generator(
spec_dict,
boundary_condition=None,
mask_array=None,
validation_mask_array=None,
training_data=None,
validation_data=None,
invcovmat_tr=None,
@@ -170,6 +171,10 @@
boundary_condition: dict
dictionary containing the instance of the a PDF set to be used as a
Boundary Condition.
mask_array: np.ndarray
training mask per replica
validation_mask_array: np.ndarray
validation mask per replica, when not given ¬mask_array will be used
n_replicas: int
number of replicas fitted simultaneously
positivity_initial: float
@@ -245,12 +250,18 @@
model_inputs = np.concatenate(model_inputs).reshape(1, -1)

# Make the mask layers...
-if mask_array is not None:
-    tr_mask_layer = Mask(mask_array, name=f"trmask_{spec_name}")
-    vl_mask_layer = Mask(~mask_array, name=f"vlmask_{spec_name}")
-else:
+if mask_array is None:
    tr_mask_layer = None
-    vl_mask_layer = None
+    if validation_mask_array is None:
+        vl_mask_layer = None
+    else:
+        vl_mask_layer = Mask(validation_mask_array, name=f"vlmask_{spec_name}")
+else:
+    tr_mask_layer = Mask(mask_array, name=f"trmask_{spec_name}")
+    if validation_mask_array is None:
+        vl_mask_layer = Mask(~mask_array, name=f"vlmask_{spec_name}")
+    else:
+        vl_mask_layer = Mask(validation_mask_array, name=f"vlmask_{spec_name}")
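To illustrate the new branch, a minimal sketch in plain NumPy (hypothetical 1-replica, 3-point arrays; the Mask layer itself is left out): with only mask_array given, the validation mask defaults to its complement, while an explicit validation_mask_array may overlap with the training mask, which is what 1-point datasets now rely on.

import numpy as np

mask_array = np.array([[True, False, True]])              # training mask: 1 replica x 3 points
default_vl = ~mask_array                                   # default validation mask: strict complement

validation_mask_array = np.array([[False, True, True]])   # explicit mask: point 2 active in both splits
overlap = mask_array & validation_mask_array
print(overlap.any())                                       # True: the same point can enter both masks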

# Make rotations of the final data (if any)
if spec_dict.get("data_transformation") is not None:
@@ -724,7 +735,7 @@ def generate_nn(
"""
nodes_list = list(nodes) # so we can modify it
x_input = Input(shape=(None, nodes_in), batch_size=1, name="NN_input")
-reg = regularizer_selector(regularizer, **regularizer_args)
+reg = reg_sec(regularizer, **regularizer_args)

if layer_type == "dense_per_flavour":
# set the arguments that will define the layer
18 changes: 11 additions & 7 deletions n3fit/src/n3fit/model_trainer.py
@@ -1,12 +1,12 @@
"""
The ModelTrainer class is the true driver around the n3fit code
The ModelTrainer class is the true driver around the n3fit code
This class is initialized with all information about the NN, inputs and outputs.
The construction of the NN and the fitting is performed at the same time when the
hyperparametrizable method of the function is called.
This class is initialized with all information about the NN, inputs and outputs.
The construction of the NN and the fitting is performed at the same time when the
hyperparametrizable method of the function is called.
This allows to use hyperscanning libraries, that need to change the parameters of the network
between iterations while at the same time keeping the amount of redundant calls to a minimum
This allows to use hyperscanning libraries, that need to change the parameters of the network
between iterations while at the same time keeping the amount of redundant calls to a minimum
"""

from collections import namedtuple
@@ -528,9 +528,12 @@ def _generate_observables(
self._reset_observables()
log.info("Generating layers")

-# We need to transpose Experimental data, stacking over replicas
+# validphys has generated the self.exp_info information replica-by-replica
+# Here we transpose all information for convenience so that the loop over observables
+# and the vectorization over replicas is made explicit
experiment_data = {
"trmask": [],
"vlmask": [],
"expdata": [],
"expdata_vl": [],
"invcovmat": [],
Expand Down Expand Up @@ -562,6 +565,7 @@ def _generate_observables(
exp_dict,
self.boundary_condition,
mask_array=experiment_data["trmask"][i],
validation_mask_array=experiment_data["vlmask"][i],
training_data=experiment_data["expdata"][i],
validation_data=experiment_data["expdata_vl"][i],
invcovmat_tr=experiment_data["invcovmat"][i],
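A toy sketch of the transposition described in the comment above (the hypothetical nested-list structure stands in for self.exp_info): the per-replica information from validphys is regrouped so that, for each observable, the replica axis is explicit.

# hypothetical structure: exp_info[replica][experiment] -> value
exp_info = [
    ["expA_r1", "expB_r1"],   # replica 1
    ["expA_r2", "expB_r2"],   # replica 2
]
# transpose so that, for each experiment, all replicas are stacked together
per_experiment = list(map(list, zip(*exp_info)))
print(per_experiment[0])      # ['expA_r1', 'expA_r2']: experiment A, stacked over replicas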
70 changes: 52 additions & 18 deletions validphys2/src/validphys/n3fit_data.py
@@ -97,16 +97,8 @@ def tr_masks(data, replica_trvlseed, parallel_models=False, replica=1, replicas=
# We do this so that a given dataset will always have the same number of points masked
trmax = int(ndata * frac)
if trmax == 0:
-    if parallel_models:
-        if replica == replicas[0]:
-            log.warning(
-                f'Single-datapoint dataset {dataset.name} encountered in parallel multi-replica fit: '
-                'all replicas will include it in their training data'
-            )
-        trmax = 1
-    else:
-        # If that number is 0, then get 1 point with probability frac
-        trmax = int(rng.random() < frac)
+    # If that number is 0, then get 1 point with probability frac
+    trmax = int(rng.random() < frac)
mask = np.concatenate([np.ones(trmax, dtype=bool), np.zeros(ndata - trmax, dtype=bool)])
rng.shuffle(mask)
trmask_partial.append(mask)
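As a sketch of the behaviour now shared by sequential and parallel fits (assuming frac = 0.75 and an independent generator per replica as stand-ins for the replica_trvlseed machinery), every replica rolls the same dice for a 1-point dataset:

import numpy as np

frac, n_replicas = 0.75, 10_000
included = [int(np.random.default_rng(seed).random() < frac) for seed in range(n_replicas)]
print(sum(included) / n_replicas)   # ~0.75: fraction of replicas keeping the single point in training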
@@ -181,13 +173,13 @@ def kfold_masks(kpartitions, data):


@functools.lru_cache
-def fittable_datasets_masked(data, tr_masks):
+def fittable_datasets_masked(data):
"""Generate a list of :py:class:`validphys.n3fit_data_utils.FittableDataSet`
from a group of dataset and the corresponding training/validation masks
"""
# This is separated from fitting_data_dict so that we can cache the result
# when the trvlseed is the same for all replicas (great for parallel replicas)
-return validphys_group_extractor(data.datasets, tr_masks.masks)
+return validphys_group_extractor(data.datasets)
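For reference, a toy sketch of the caching that a mask-free signature makes easier (load_group is a hypothetical stand-in, not the real function): functools.lru_cache only reuses a result when the function is called again with arguments that hash and compare equal, so dropping the per-replica masks from the arguments lets one cached result serve every replica.

import functools

@functools.lru_cache
def load_group(group_name):          # hypothetical stand-in for fittable_datasets_masked(data)
    print(f"loading fktables for {group_name}")
    return f"fittable datasets for {group_name}"

load_group("NMC")                    # triggers the load
load_group("NMC")                    # cache hit: nothing is loaded again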


def fitting_data_dict(
@@ -259,9 +251,31 @@ def fitting_data_dict(
dt_trans_tr = None
dt_trans_vl = None

# In the fittable datasets the fktables masked for 1-point datasets will be set to 0
# Here we want to have the data both in training and validation,
# but set to 0 the data, so that it doesn't affect the chi2 value.
zero_tr = []
zero_vl = []
idx = 0
for data_mask in tr_masks:
dlen = len(data_mask)
if dlen == 1:
if data_mask[0]:
zero_vl.append(idx)
else:
zero_tr.append(idx)
idx += dlen

tr_mask = np.concatenate(tr_masks)
vl_mask = ~tr_mask

# Now set to true the masks
tr_mask[zero_tr] = True
vl_mask[zero_vl] = True
# And prepare the index to 0 the (inverse) covmat
data_zero_tr = np.cumsum(tr_mask)[zero_tr] - 1
data_zero_vl = np.cumsum(vl_mask)[zero_vl] - 1
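A worked example of the index bookkeeping above (hypothetical 5-point layout with one 1-point dataset at global index 2 that fell on the validation side): after forcing the point into the training mask, the cumulative sum maps its global index to its row inside the masked training arrays.

import numpy as np

tr_mask = np.array([True, False, False, True, False])
zero_tr = [2]                            # the 1-point dataset whose training entry must be zeroed
tr_mask[zero_tr] = True                  # keep the point in both training and validation
data_zero_tr = np.cumsum(tr_mask)[zero_tr] - 1
print(data_zero_tr)                      # [1]: row/column 1 of the training covariance matrix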

if diagonal_basis:
expdata = np.matmul(dt_trans, expdata)
# make a 1d array of the diagonal
@@ -274,18 +288,38 @@
# prepare a masking rotation
dt_trans_tr = dt_trans[tr_mask]
dt_trans_vl = dt_trans[vl_mask]

+    # TODO: check the effect of this when diagonalization
+    invcovmat_tr[data_zero_tr] = 0.0
+    invcovmat_vl[data_zero_vl] = 0.0
else:
    covmat_tr = covmat[tr_mask].T[tr_mask]
-    invcovmat_tr = np.linalg.inv(covmat_tr)

    covmat_vl = covmat[vl_mask].T[vl_mask]

+    # Remove possible correlations for 1-point datasets
+    # that should've been masked out
+    covmat_tr[data_zero_tr, :] = covmat_tr[:, data_zero_tr] = 0.0
+    covmat_vl[data_zero_vl, :] = covmat_vl[:, data_zero_vl] = 0.0
+    # Avoid infinities
+    covmat_tr[np.ix_(data_zero_tr, data_zero_tr)] = 1.0
+    covmat_vl[np.ix_(data_zero_vl, data_zero_vl)] = 1.0
+
+    invcovmat_tr = np.linalg.inv(covmat_tr)
    invcovmat_vl = np.linalg.inv(covmat_vl)

-ndata_tr = np.count_nonzero(tr_mask)
-expdata_tr = expdata[tr_mask].reshape(1, ndata_tr)
+# Set to 0 the points in the diagonal that were left as 1
+invcovmat_tr[np.ix_(data_zero_tr, data_zero_tr)] = 0.0
+invcovmat_vl[np.ix_(data_zero_vl, data_zero_vl)] = 0.0

+ndata_tr = np.count_nonzero(tr_mask)
ndata_vl = np.count_nonzero(vl_mask)
-expdata_vl = expdata[vl_mask].reshape(1, ndata_vl)
+
+# And subtract them for ndata
+ndata_tr -= len(data_zero_tr)
+ndata_vl -= len(data_zero_vl)
+
+expdata_tr = expdata[tr_mask].reshape(1, -1)
+expdata_vl = expdata[vl_mask].reshape(1, -1)
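The recipe above can be checked on a toy covariance matrix (random positive-definite 3x3, point 1 to be neutralised, all values hypothetical): zeroing the row and column decouples the point, the unit diagonal keeps the matrix invertible, and zeroing the corresponding entry of the inverse removes it from the chi2 entirely.

import numpy as np

rng = np.random.default_rng(0)
a = rng.normal(size=(3, 3))
covmat = a @ a.T + 3.0 * np.eye(3)               # generic positive-definite covariance
zero = [1]

covmat[zero, :] = covmat[:, zero] = 0.0          # remove correlations with the point
covmat[np.ix_(zero, zero)] = 1.0                 # avoid a singular matrix
invcov = np.linalg.inv(covmat)
invcov[np.ix_(zero, zero)] = 0.0                 # the point no longer contributes

residuals = np.array([0.1, 123.4, -0.2])         # arbitrary data-theory differences
print(residuals @ invcov @ residuals)            # unaffected by the value of residuals[1]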

# Now save a dictionary of training/validation/experimental folds
# for training and validation we need to apply the tr/vl masks
@@ -539,7 +573,7 @@ def _fitting_lagrange_dict(lambdadataset):
integrability = isinstance(lambdadataset, IntegrabilitySetSpec)
mode = "integrability" if integrability else "positivity"
log.info("Loading %s dataset %s", mode, lambdadataset)
-positivity_datasets = validphys_group_extractor([lambdadataset], [])
+positivity_datasets = validphys_group_extractor([lambdadataset])
ndata = positivity_datasets[0].ndata
return {
"datasets": positivity_datasets,
34 changes: 4 additions & 30 deletions validphys2/src/validphys/n3fit_data_utils.py
@@ -7,10 +7,8 @@
The ``validphys_group_extractor`` will loop over every dataset of a given group
loading their fktables (and applying any necessary cuts).
"""
-import dataclasses
-from itertools import zip_longest
-
-import numpy as np
+import dataclasses


@dataclasses.dataclass
@@ -38,16 +36,6 @@ class FittableDataSet:

# Things that can have default values:
operation: str = "NULL"
-frac: float = 1.0
-training_mask: np.ndarray = None  # boolean array
-
-def __post_init__(self):
-    self._tr_mask = None
-    self._vl_mask = None
-    if self.training_mask is not None:
-        data_idx = self.fktables_data[0].sigma.index.get_level_values(0).unique()
-        self._tr_mask = data_idx[self.training_mask].values
-        self._vl_mask = data_idx[~self.training_mask].values

@property
def ndata(self):
@@ -63,20 +51,8 @@ def fktables(self):
"""Return the list of fktable tensors for the dataset"""
return [fk.get_np_fktable() for fk in self.fktables_data]

-def training_fktables(self):
-    """Return the fktable tensors for the trainig data"""
-    if self._tr_mask is not None:
-        return [fk.with_cuts(self._tr_mask).get_np_fktable() for fk in self.fktables_data]
-    return self.fktables()
-
-def validation_fktables(self):
-    """Return the fktable tensors for the validation data"""
-    if self._vl_mask is not None:
-        return [fk.with_cuts(self._vl_mask).get_np_fktable() for fk in self.fktables_data]
-    return self.fktables()


-def validphys_group_extractor(datasets, tr_masks):
+def validphys_group_extractor(datasets):
"""
Receives a grouping spec from validphys (most likely an experiment)
and loops over its content extracting and parsing all information required for the fit
@@ -85,18 +61,16 @@ def validphys_group_extractor(datasets, tr_masks):
----------
datasets: list(:py:class:`validphys.core.DataSetSpec`)
List of dataset specs in this group
-tr_masks: list(np.array)
-    List of training masks to be set for each dataset
Returns
-------
loaded_obs: list (:py:class:`validphys.n3fit_data_utils.FittableDataSet`)
"""
loaded_obs = []
-# Use zip_longest since tr_mask can be (and it is fine) an empty list
-for dspec, mask in zip_longest(datasets, tr_masks):
+for dspec in datasets:
# Load all fktables with the appropiate cuts
fktables = [fk.load_with_cuts(dspec.cuts) for fk in dspec.fkspecs]
# And now put them in a FittableDataSet object which
-loaded_obs.append(FittableDataSet(dspec.name, fktables, dspec.op, dspec.frac, mask))
+loaded_obs.append(FittableDataSet(dspec.name, fktables, dspec.op))
return loaded_obs
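A toy sketch of the slimmed-down container this file now builds (ToyFittableDataSet is illustrative only, mirroring the fields that remain): the training/validation split no longer lives on the dataset object and is applied later through the mask layers and the (inverse) covariance matrices.

import dataclasses

@dataclasses.dataclass
class ToyFittableDataSet:            # hypothetical mirror of the trimmed-down dataclass
    name: str
    fktables_data: list
    operation: str = "NULL"

datasets = [
    ToyFittableDataSet("toy_dis", fktables_data=[]),
    ToyFittableDataSet("toy_dy", fktables_data=[], operation="RATIO"),
]
for ds in datasets:
    print(ds.name, ds.operation)     # no frac or training_mask stored any more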
