NNPDF · achiefa · Jun 5, 2026 · Jun 11, 2026 · Jun 27, 2026
diff --git a/n3fit/src/n3fit/backends/keras_backend/MetaLayer.py b/n3fit/src/n3fit/backends/keras_backend/MetaLayer.py
@@ -1,21 +1,105 @@
 """
-    The class MetaLayer is an extension of the backend Layer class
-    with a number of methods and helpers to facilitate writing new custom layers
-    in such a way that the new custom layer don't need to rely in anything backend-dependent
+The class MetaLayer is an extension of the backend Layer class
+with a number of methods and helpers to facilitate writing new custom layers
+in such a way that new custom layers don't need to rely on anything backend-dependent
 
-    In other words, if you want to implement a new layer and need functions not included here
-    it is better to add a new method which is just a call to the relevant backend-dependent function
-    For instance: np_to_tensor is just a call to K.constant
+In other words, if you want to implement a new layer and need functions not included here
+it is better to add a new method which is just a call to the relevant backend-dependent function
+For instance: np_to_tensor is just a call to K.constant
 """
 
-from keras.initializers import Constant, RandomUniform, glorot_normal, glorot_uniform
+import math
+
+from keras import random
+from keras.initializers import Constant, RandomUniform, VarianceScaling, glorot_uniform
 from keras.layers import Layer
 
+
+class GammaVarianceScaling(VarianceScaling):
+    """``VarianceScaling`` with a tunable exponent ``gamma`` on the variance.
+
+    keras' ``VarianceScaling`` draws weights with variance ``scale / fan`` (standard
+    deviation ``sqrt(scale / fan)``). This variant raises that variance to the power
+    ``gamma``:
+
+        variance = (scale / fan) ** gamma,   std = (scale / fan) ** (gamma / 2).
+
+    ``gamma = 1`` reproduces ``VarianceScaling`` (hence ``glorot_normal``) exactly. For
+    ``scale / fan < 1``, ``gamma > 1`` makes the initialisation narrower and ``gamma < 1``
+    wider. ``fan`` is ``fan_in``, ``fan_out`` or their average, per ``mode`` (``fan_avg``
+    for ``glorot_normal``), and is clamped to at least 1 before dividing.
+    """
+
+    # Divisor applied to the stddev passed to ``truncated_normal`` (same constant as keras).
+    _TRUNCATED_CORRECTION = 0.87962566103423978
+
+    def __init__(
+        self, gamma=1.0, scale=1.0, mode="fan_in", distribution="truncated_normal", seed=None
+    ):
+        super().__init__(scale=scale, mode=mode, distribution=distribution, seed=seed)
+        self.gamma = gamma
+
+    @staticmethod
+    def _compute_fans(shape):
+        """Return ``(fan_in, fan_out)`` as floats for a weight of the given shape."""
+        shape = tuple(shape)
+        if len(shape) < 1:
+            fan_in = fan_out = 1
+        elif len(shape) == 1:
+            fan_in = fan_out = shape[0]
+        elif len(shape) == 2:
+            fan_in, fan_out = shape
+        else:
+            receptive_field_size = 1
+            for dim in shape[:-2]:
+                receptive_field_size *= dim
+            fan_in = shape[-2] * receptive_field_size
+            fan_out = shape[-1] * receptive_field_size
+        return float(fan_in), float(fan_out)
+
+    def __call__(self, shape, dtype=None):
+        scale = self.scale
+        fan_in, fan_out = self._compute_fans(shape)
+        if self.mode == "fan_in":
+            scale /= max(1.0, fan_in)
+        elif self.mode == "fan_out":
+            scale /= max(1.0, fan_out)
+        else:
+            scale /= max(1.0, (fan_in + fan_out) / 2.0)
+        # `scale` is now the post-division variance scale/fan; keras would take
+        # std = sqrt(scale). Raise the *variance* to gamma -> std = scale**(gamma/2).
+        # gamma=1 gives sqrt(scale), i.e. the plain VarianceScaling width.
+        std = scale ** (self.gamma / 2.0)
+        if self.distribution == "truncated_normal":
+            return random.truncated_normal(
+                shape,
+                mean=0.0,
+                stddev=std / self._TRUNCATED_CORRECTION,
+                dtype=dtype,
+                seed=self.seed,
+            )
+        elif self.distribution == "untruncated_normal":
+            return random.normal(shape, mean=0.0, stddev=std, dtype=dtype, seed=self.seed)
+        else:  # any other distribution is treated as uniform: limit = sqrt(3) * std
+            limit = math.sqrt(3.0) * std
+            return random.uniform(shape, minval=-limit, maxval=limit, dtype=dtype, seed=self.seed)
+
+    def get_config(self):
+        return {**super().get_config(), "gamma": self.gamma}
+
+
 # Define in this dictionary new initializers as well as the arguments they accept (with default values if needed be)
 initializers = {
     "random_uniform": (RandomUniform, {"minval": -0.5, "maxval": 0.5}),
     "glorot_uniform": (glorot_uniform, {}),
-    "glorot_normal": (glorot_normal, {}),
+    # glorot_normal expressed via GammaVarianceScaling so its width is tunable through
+    # `scale` (variance multiplier) and `gamma` (exponent on the variance:
+    # variance = (scale/fan)**gamma). keras' glorot_normal is a *truncated* normal with
+    # fan_avg, so scale=1.0, gamma=1.0 reproduces it exactly; gamma>1 narrower, gamma<1 wider.
+    "glorot_normal": (
+        GammaVarianceScaling,
+        {"scale": 1.0, "gamma": 1.0, "mode": "fan_avg", "distribution": "truncated_normal"},
+    ),
 }
 
 
@@ -91,10 +175,11 @@ def select_initializer(ini_name, seed=None, **kwargs):
             ) from e
 
         ini_class = ini_tuple[0]
-        ini_args = ini_tuple[1]
+        # Copy so per-call overrides (seed, scale, ...) don't leak into the shared defaults
+        ini_args = dict(ini_tuple[1])
         ini_args["seed"] = seed
 
         for key, value in kwargs.items():
-            if key in ini_args.keys():
+            if key in ini_args:
                 ini_args[key] = value
         return ini_class(**ini_args)
diff --git a/n3fit/src/n3fit/backends/keras_backend/callbacks.py b/n3fit/src/n3fit/backends/keras_backend/callbacks.py
@@ -13,6 +13,7 @@
 """
 
 import logging
+from pathlib import Path
 from time import time
 
 from keras import backend as K
@@ -196,6 +197,71 @@ def on_step_end(self, epoch, logs=None):
             self._update_weights()
 
 
+class StoreCallback(CallbackStep):
+    """Store each replica's flattened trainable parameters as ``.npz`` files.
+
+    Parameters
+    ----------
+        pdf_model: MetaModel
+            The multi-replica PDF model
+        replica_paths: list[Path]
+            One path per replica; files go to <path>/parameters/params_<epoch>.npz
+        stopping_object:
+            Provides ``_best_weights`` and ``_best_epochs``, read in ``on_train_end``
+        check_freq: int
+            Save every this many epochs (default: 100)
+    """
+
+    def __init__(self, pdf_model, replica_paths, stopping_object, check_freq=100):
+        super().__init__()
+        self.check_freq = check_freq
+        self.pdf_model = pdf_model
+        self.weight_dirs = []
+        self.stopping_object = stopping_object
+        for path in replica_paths:
+            weight_dir = path / "parameters"
+            weight_dir.mkdir(parents=True, exist_ok=True)
+            self.weight_dirs.append(weight_dir)
+
+    def _save_weights(self, epoch, tr_weights, weight_dir):
+        filepath = weight_dir / f"params_{epoch}.npz"
+        # flatten and concatenate all weights into one 1D array stored under the key "params"
+        trainable_weights_flat = np.concatenate([np.asarray(w).flatten() for w in tr_weights])
+        np.savez(filepath, params=trainable_weights_flat)
+        log.info(f"Saved parameters at epoch {epoch} in {filepath}")
+
+    def on_train_begin(self, logs=None):
+        """Store the model parameters at initialisation (epoch 0), before any
+        gradient step has been taken."""
+        pdf_replicas = self.pdf_model.split_replicas()
+        for replica_model, weight_dir in zip(pdf_replicas, self.weight_dirs):
+            self._save_weights(0, replica_model.trainable_weights, weight_dir)
+
+    def on_step_end(self, epoch, logs=None):
+        """Function to be called at the end of every epoch
+        Every ``check_freq`` number of epochs, the parameters of the model will
+        be stored in the indicated directory.
+        """
+        if ((epoch + 1) % self.check_freq) == 0:
+            pdf_replicas = self.pdf_model.split_replicas()
+            for replica_model, weight_dir in zip(pdf_replicas, self.weight_dirs):
+                weights = replica_model.trainable_weights
+                self._save_weights(epoch + 1, weights, weight_dir)
+
+    def on_train_end(self, logs=None):
+        """Store the best parameters (same file naming: overwrites an equal-epoch checkpoint)"""
+        for idx, weight_dir in enumerate(self.weight_dirs):
+            weights = self.stopping_object._best_weights[idx]
+            if weights is not None:
+                best_weights = weights['all_NNs']
+                best_epoch = self.stopping_object._best_epochs[idx]
+                self._save_weights(best_epoch, best_weights, weight_dir)
+            else:
+                log.warning(
+                    f"No best weights found for replica {idx+1}, skipping saving best parameters."
+                )
+
+
 def gen_tensorboard_callback(log_dir, profiling=False, histogram_freq=0):
     """
     Generate tensorboard logging details at ``log_dir``.

diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py
@@ -220,7 +220,9 @@
 
 
 class WriterWrapper:
-    def __init__(self, replica_numbers, pdf_objects, stopping_object, all_chi2s, theory, timings, trials):
+    def __init__(
+        self, replica_numbers, pdf_objects, stopping_object, all_chi2s, theory, timings, trials
+    ):
         """
         Initializes the writer for all replicas.
 
@@ -298,18 +300,18 @@ def _hyperparam_settings(self, replica_number):
             trials_number = self.trials["number_of_trials"]
             idx_trial = replica_number % trials_number
             hyperparam_info = {}
-            hyperparam_info["optimizer"]=self.trials["optimizer"][idx_trial]
-            hyperparam_info["learning_rate"]=self.trials["learning_rate"][idx_trial]
-            hyperparam_info["clipnorm"]=self.trials["clipnorm"][idx_trial]
-            hyperparam_info["epochs"]=self.trials["epochs"][idx_trial]
-            hyperparam_info["stopping_patience"]=self.trials["stopping_patience"][idx_trial]
-            hyperparam_info["initial"]=self.trials["initial"][idx_trial]
-            hyperparam_info["nodes_per_layer"]=self.trials["nodes_per_layer"][idx_trial]
-            hyperparam_info["number_of_layers"]=self.trials["number_of_layers"][idx_trial]
-            hyperparam_info["activation"]=self.trials["activation_per_layer"][idx_trial]
-            hyperparam_info["layer_type"]=self.trials["layer_type"][idx_trial]
-            hyperparam_info["initializer"]=self.trials["initializer"][idx_trial]
-            hyperparam_info["dropout"]=self.trials["dropout"][idx_trial]
+            hyperparam_info["optimizer"] = self.trials["optimizer"][idx_trial]
+            hyperparam_info["learning_rate"] = self.trials["learning_rate"][idx_trial]
+            hyperparam_info["clipnorm"] = self.trials["clipnorm"][idx_trial]
+            hyperparam_info["epochs"] = self.trials["epochs"][idx_trial]
+            hyperparam_info["stopping_patience"] = self.trials["stopping_patience"][idx_trial]
+            hyperparam_info["initial"] = self.trials["initial"][idx_trial]
+            hyperparam_info["nodes_per_layer"] = self.trials["nodes_per_layer"][idx_trial]
+            hyperparam_info["number_of_layers"] = self.trials["number_of_layers"][idx_trial]
+            hyperparam_info["activation"] = self.trials["activation_per_layer"][idx_trial]
+            hyperparam_info["layer_type"] = self.trials["layer_type"][idx_trial]
+            hyperparam_info["initializer"] = self.trials["initializer"][idx_trial]
+            hyperparam_info["dropout"] = self.trials["dropout"][idx_trial]
             return hyperparam_info
         else:
             hyperparam_info = "from runcard"
@@ -329,6 +331,11 @@ def _write_metadata_json(self, i, replica_number, out_path):
             # Note: the 2 arguments below are the same for all replicas, unless run separately
             timing=self.timings,
             stop_epoch=self.stopping_object.stop_epoch,
+            would_stop_epoch=(
+                self.stopping_object.would_stop_epoch
+                if self.stopping_object._dont_stop
+                else self.stopping_object.stop_epoch
+            ),
         )
 
         with open(out_path, "w", encoding="utf-8") as fs:
@@ -373,6 +380,7 @@ def jsonfit(
     true_chi2,
     stop_epoch,
     timing,
+    would_stop_epoch,
     hyperparam_info,
 ):
     """Generates a dictionary containing all relevant metadata for the fit
@@ -399,7 +407,9 @@ def jsonfit(
             epoch at which the stopping stopped (not the one for the best fit!)
         timing: dict
             dictionary of the timing of the different events that happened
-        hyperparam_info: dict 
+        would_stop_epoch: int
+            epoch at which the stopping would have stopped if it were not set to "dont_stop"
+        hyperparam_info: dict
             dictionary of hyperparameter settings
     """
     all_info = {}
@@ -415,6 +425,7 @@ def jsonfit(
     all_info["arc_lengths"] = arc_lengths
     all_info["integrability"] = integrability_numbers
     all_info["timing"] = timing
+    all_info["would_stop_epoch"] = would_stop_epoch
     all_info["hyperparameters"] = hyperparam_info
     # Versioning info
     all_info["version"] = version()

diff --git a/n3fit/src/n3fit/model_gen.py b/n3fit/src/n3fit/model_gen.py
@@ -347,6 +347,13 @@ class ReplicaSettings:
             e.g. ``dense`` or ``dense_per_flavour``
         initializer: str
             initializer to be used for this replica
+        initializer_scale: float
+            variance multiplier for the initializer distribution. Only affects ``glorot_normal``
+            (weight std scales as scale**(gamma/2)); 1.0 reproduces standard glorot_normal
+        initializer_gamma: float
+            exponent on the initializer variance: ``variance = (scale/fan)**gamma``
+            (``std = (scale/fan)**(gamma/2)``). Only affects ``glorot_normal``; 1.0
+            reproduces standard glorot_normal
         dropout: float
             rate of dropout for each layer
         regularizer: str
@@ -360,6 +367,8 @@ class ReplicaSettings:
     activations: list[str]
     architecture: str = "dense"
     initializer: str = "glorot_normal"
+    initializer_scale: float = 1.0
+    initializer_gamma: float = 1.0
     dropout_rate: float = 0.0
     regularizer: str = None
     regularizer_args: dict = field(default_factory=dict)
@@ -806,6 +815,8 @@ def _generate_nn(
     activations: list[str] = None,
     architecture: str = "dense",
     initializer: str = None,
+    initializer_scale: float = 1.0,
+    initializer_gamma: float = 1.0,
     dropout_rate: float = 0.0,
     regularizer: str = None,
     regularizer_args: dict = field(default_factory=dict),
@@ -848,7 +859,9 @@ def layer_generator(i_layer, nodes_out, activation):
             """Generate the ``i_layer``-th dense_per_flavour layer for all replicas."""
             l_seed = int(seed + i_layer * n_flavours)
             initializers = [
-                MetaLayer.select_initializer(initializer, seed=l_seed + b)
+                MetaLayer.select_initializer(
+                    initializer, seed=l_seed + b, scale=initializer_scale, gamma=initializer_gamma
+                )
                 for b in range(n_flavours)
             ]
             layer = base_layer_selector(
@@ -863,7 +876,12 @@ def layer_generator(i_layer, nodes_out, activation):
     elif architecture == "dense":
 
         def layer_generator(i_layer, nodes_out, activation):
-            kini = MetaLayer.select_initializer(initializer, seed=int(seed + i_layer))
+            kini = MetaLayer.select_initializer(
+                initializer,
+                seed=int(seed + i_layer),
+                scale=initializer_scale,
+                gamma=initializer_gamma,
+            )
             return base_layer_selector(
                 architecture,
                 kernel_initializer=kini,