Skip to content

Commit

Permalink
test that parallel and sequential runs produce exactly the same fits …
Browse files Browse the repository at this point in the history
…and pseudodata
  • Loading branch information
scarlehoff committed Feb 19, 2025
1 parent d38c030 commit 9aab465
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 14 deletions.
10 changes: 4 additions & 6 deletions doc/sphinx/source/n3fit/runcard_detailed.rst
Original file line number Diff line number Diff line change
Expand Up @@ -318,10 +318,8 @@ flag in the runcard to ``true`` when running a range of replicas.
Running in parallel can be quite hard on memory and it is only advantageous when
fitting on a GPU, where one can find a speed up equal to the number of models run
in parallel (each model being a different replica).

When running in parallel it might be advantageous (e.g., for debugging)
to set the training validation split to be equal for all replicas,
this can be done with the `same_trvl_per_replica: true` runcard flag.
Running models in parallel produces exactly the same pseudodata as the sequential runs.
Note that numerical differences might still be generated during the training.

In other words, in order to run several replicas in parallel in a machine
(be it a big CPU or, most likely, a GPU)
Expand All @@ -332,8 +330,8 @@ top-level option:
parallel_models: true
Note that currently, in order to run with parallel models, one has to set ``savepseudodata: false``
in the ``fitting`` section of the runcard. Once this is done, the user can run ``n3fit`` with a
Once this is done, the user can run ``n3fit`` with a
replica range to be parallelized (in this case from replica 1 to replica 4).

.. code-block:: bash
Expand Down
9 changes: 9 additions & 0 deletions n3fit/src/n3fit/scripts/n3fit_exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@
TAB_FOLDER = "tables"


# Suppress the arguments that we don't want the help of n3fit to show
# note that these would still be parsed by vp/reportengine
SUPPRESS = ["parallel", "no-parallel", "scheduler", "style", "format"]


class N3FitError(Exception):
"""Exception raised when n3fit cannot succeed and knows why"""

Expand Down Expand Up @@ -242,6 +247,10 @@ def __init__(self):
@property
def argparser(self):
parser = super().argparser

for argo in SUPPRESS:
parser.add_argument(f"--{argo}", help=argparse.SUPPRESS)

parser.add_argument(
"-o", "--output", help="Output folder and name of the fit", default=None
)
Expand Down
88 changes: 80 additions & 8 deletions n3fit/src/n3fit/tests/test_fit.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
"""
Regression tests for n3fit
Regression tests for n3fit
This file will run a fit with a runcard which includes:
- A DIS dataset
- A Hadronic dataset
- Two positivity sets
And checks that the results have not changed from the previous iteration of the code
This file will run a fit with a runcard which includes:
- A DIS dataset
- A Hadronic dataset
- Two positivity sets
And checks that the results have not changed from the previous iteration of the code
If the results are known to need a change,
it is necessary to flag _something_ to regenerate regression
If the results are known to need a change,
it is necessary to flag _something_ to regenerate regression
"""

import json
Expand All @@ -19,6 +19,7 @@

import h5py
from numpy.testing import assert_allclose, assert_equal
import pandas as pd
import pytest

import n3fit
Expand Down Expand Up @@ -240,6 +241,77 @@ def test_multireplica_runs(tmp_path, runcard):
compare_weights(option_1, option_2, file_1, file_2)


@pytest.mark.linux
def test_parallel_against_sequential(tmp_path, rep_from=6, rep_to=8):
    """Checks that running in parallel and sequentially produces exactly the same results.

    This test runs several fits from the same runcard (only ``parallel_models`` differs):
        1. A sequential fit per replica, in a loop from ``rep_from`` to ``rep_to`` (inclusive)
        2. A single parallel fit from replica ``rep_from`` to ``rep_to``

    And checks:
        1) The .csv generated by the fit:
            a) The same pseudodata has been generated by ``make_replica``
            b) Exactly the same cuts are being used in the parallel and sequential fits
            c) And can be reproduced!
        2) The .json file that contains the fit parameters and results:
            after a single epoch no numerical differences are expected
            between the sequential and parallel fits

    Parameters
    ----------
        tmp_path: pathlib.Path
            pytest fixture, used both as output folder for the runcards and as working directory
        rep_from: int
            first replica to be fitted
        rep_to: int
            last replica to be fitted (included)
    """
    input_card = REGRESSION_FOLDER / QUICKNAME
    card_parallel = tmp_path / "parallel.yml"
    card_sequenti = tmp_path / "sequenti.yml"

    n3fit_input = yaml_safe.load(input_card.with_suffix(".yml"))
    n3fit_input["debug"] = False
    n3fit_input.pop("load")

    # Complicate slightly the choice of dataset so that different scenarios are tested
    datasets = [
        "HERA_CC_318GEV_EM-SIGMARED",
        "HERA_CC_318GEV_EP-SIGMARED",
        "ATLAS_Z0_7TEV_49FB_HIMASS",
        "ATLAS_TTBAR_8TEV_TOT_X-SEC",
        "CMS_SINGLETOP_13TEV_TCHANNEL-XSEC",
    ]
    dataset_inputs = [{"dataset": d, "frac": 0.6, "variant": "legacy"} for d in datasets]
    n3fit_input["dataset_inputs"] = dataset_inputs
    # Exit immediately
    n3fit_input["parameters"]["epochs"] = 1
    # Save pseudodata
    n3fit_input["fitting"]["savepseudodata"] = True

    # Dump the same runcard twice, the only difference being the ``parallel_models`` flag
    n3fit_input["parallel_models"] = False
    yaml_safe.dump(n3fit_input, card_sequenti)
    n3fit_input["parallel_models"] = True
    yaml_safe.dump(n3fit_input, card_parallel)

    name_seq = card_sequenti.with_suffix("").name
    name_par = card_parallel.with_suffix("").name

    replicas = range(rep_from, rep_to + 1)

    # Now run both
    # The commands are built as lists of arguments (instead of splitting a formatted string)
    # so that a temporary path containing whitespace does not break the call
    exe = str(EXE).split()
    for r in replicas:
        sp.run([*exe, str(card_sequenti), str(r)], cwd=tmp_path, check=True)
    sp.run(
        [*exe, str(card_parallel), str(rep_from), "-r", str(rep_to)], cwd=tmp_path, check=True
    )

    # Loop over all pseudodata files for both fits and load them up
    folder_seq = card_sequenti.with_suffix("") / "nnfit"
    folder_par = card_parallel.with_suffix("") / "nnfit"

    # Without this check, the comparison below would pass trivially if no .csv was written
    csvfiles_seq = sorted(folder_seq.glob("*/*.csv"))
    assert csvfiles_seq, f"No .csv files were found under {folder_seq}"

    # Both should have exactly the same pseudodata in the same locations
    for csvfile_seq in csvfiles_seq:
        csvfile_par = folder_par / csvfile_seq.relative_to(folder_seq)

        result_seq = pd.read_csv(csvfile_seq, sep="\t", index_col=[0, 1, 2], header=0)
        result_par = pd.read_csv(csvfile_par, sep="\t", index_col=[0, 1, 2], header=0)
        pd.testing.assert_frame_equal(result_seq, result_par)

    # Check the rest of the fit, while numerical differences are expected between sequential
    # and parallel runs, one single epoch should not be enough to generate them
    for r in replicas:
        seq_json = folder_seq / f"replica_{r}" / f"{name_seq}.json"
        check_fit_results(tmp_path, name_par, r, seq_json)


def compare_weights(option_1, option_2, file_1, file_2):
"""Reads two weight files and checks that the weights are the same between the two"""
for key in file_1.keys():
Expand Down

0 comments on commit 9aab465

Please sign in to comment.