diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc
index e62c4dd482..5c3691ab61 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc
@@ -26,8 +26,9 @@ namespace mg5amcCpu
                                                     const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1]
                                                     BufferMomenta& momenta,            // output: momenta
                                                     BufferWeights& weights,            // output: weights
-                                                    const size_t nevt )
-    : SamplingKernelBase( energy, rndmom, momenta, weights )
+                                                    const size_t nevt,
+                                                    const fptype* massesFinal )
+    : SamplingKernelBase( energy, rndmom, momenta, weights, massesFinal )
     , NumberOfEvents( nevt )
   {
     if( m_rndmom.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelHost: rndmom must be a host array" );
@@ -85,7 +86,7 @@ namespace mg5amcCpu
       const fptype* ievtRndmom = MemoryAccessRandomNumbers::ieventAccessRecordConst( m_rndmom.data(), ievt );
       fptype* ievtMomenta = MemoryAccessMomenta::ieventAccessRecord( m_momenta.data(), ievt );
       fptype* ievtWeights = MemoryAccessWeights::ieventAccessRecord( m_weights.data(), ievt );
-      getMomentaFinal( m_energy, ievtRndmom, ievtMomenta, ievtWeights );
+      getMomentaFinal( m_energy, ievtRndmom, ievtMomenta, ievtWeights, this->massesFinal() );
     }
     // ** END LOOP ON IEVT **
   }
@@ -98,11 +99,13 @@ namespace mg5amcCpu
                                                         BufferMomenta& momenta,            // output: momenta
                                                         BufferWeights& weights,            // output: weights
                                                         const size_t gpublocks,
-                                                        const size_t gputhreads )
-    : SamplingKernelBase( energy, rndmom, momenta, weights )
+                                                        const size_t gputhreads,
+                                                        const fptype* massesFinal )
+    : SamplingKernelBase( energy, rndmom, momenta, weights, massesFinal )
     , NumberOfEvents( gpublocks * gputhreads )
     , m_gpublocks( gpublocks )
     , m_gputhreads( gputhreads )
+    , m_massesFinalDevice( nullptr )
   {
     if( !m_rndmom.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelDevice: rndmom must be a device array" );
     if( !m_momenta.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelDevice: momenta must be a device array" );
@@ -130,6 +133,18 @@ namespace mg5amcCpu
       sstr << "RamboSamplingKernelDevice: gputhreads should be a multiple of neppR=" << neppR;
       throw std::runtime_error( sstr.str() );
     }
+    gpuMalloc( &m_massesFinalDevice, CPPProcess::nparf * sizeof( fptype ) );
+    gpuMemcpy( m_massesFinalDevice, this->massesFinal(), CPPProcess::nparf * sizeof( fptype ), gpuMemcpyHostToDevice );
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  RamboSamplingKernelDevice::~RamboSamplingKernelDevice()
+  {
+    if( m_massesFinalDevice != nullptr ) gpuFree( m_massesFinalDevice );
+    m_massesFinalDevice = nullptr;
   }
 #endif
 
@@ -162,10 +177,11 @@ namespace mg5amcCpu
   getMomentaFinalDevice( const fptype energy,
                          const fptype* rndmom,
                          fptype* momenta,
-                         fptype* wgts )
+                         fptype* wgts,
+                         const fptype* massesFinal )
   {
     constexpr auto getMomentaFinal = ramboGetMomentaFinal<DeviceAccessRandomNumbers, DeviceAccessMomenta, DeviceAccessWeights>;
-    return getMomentaFinal( energy, rndmom, momenta, wgts );
+    return getMomentaFinal( energy, rndmom, momenta, wgts, massesFinal );
   }
 #endif
 
@@ -175,7 +191,8 @@ namespace mg5amcCpu
   void
   RamboSamplingKernelDevice::getMomentaFinal()
   {
-    gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() );
+    gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data(),
+                     m_massesFinalDevice );
   }
 #endif
 
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h
index a217619b9c..689a083542 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h
@@ -8,6 +8,7 @@
 
 #include "mgOnGpuConfig.h"
 
+#include "CPPProcess.h"
 #include "MemoryBuffers.h"
 
 #ifdef MGONGPUCPP_GPUIMPL
@@ -27,12 +28,14 @@ namespace mg5amcCpu
     SamplingKernelBase( const fptype energy,               // input: energy
                         const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1]
                         BufferMomenta& momenta,            // output: momenta
-                        BufferWeights& weights )           // output: weights
+                        BufferWeights& weights,            // output: weights
+                        const fptype* massesFinal = nullptr ) // input: final-state masses (size nparf)
       : m_energy( energy )
       , m_rndmom( rndmom )
       , m_momenta( momenta )
       , m_weights( weights )
     {
+      for( int iparf = 0; iparf < CPPProcess::nparf; iparf++ ) m_massesFinal[iparf] = ( massesFinal == nullptr ) ? 0 : massesFinal[iparf];
     }
 
   public:
@@ -49,6 +52,9 @@ namespace mg5amcCpu
     // Is this a host or device kernel?
     virtual bool isOnDevice() const = 0;
 
+    // Get final-state masses
+    const fptype* massesFinal() const { return m_massesFinal; }
+
   protected:
 
     // The energy
@@ -62,6 +68,9 @@ namespace mg5amcCpu
 
     // The buffer for the output weights
     BufferWeights& m_weights;
+
+    // The final-state masses
+    fptype m_massesFinal[CPPProcess::nparf];
   };
 
   //--------------------------------------------------------------------------
@@ -76,7 +85,8 @@ namespace mg5amcCpu
                              const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1]
                              BufferMomenta& momenta,            // output: momenta
                              BufferWeights& weights,            // output: weights
-                             const size_t nevt );
+                             const size_t nevt,
+                             const fptype* massesFinal = nullptr );
 
     // Destructor
     virtual ~RamboSamplingKernelHost() {}
@@ -105,10 +115,11 @@ namespace mg5amcCpu
                                BufferMomenta& momenta,            // output: momenta
                                BufferWeights& weights,            // output: weights
                                const size_t gpublocks,
-                               const size_t gputhreads );
+                               const size_t gputhreads,
+                               const fptype* massesFinal = nullptr );
 
     // Destructor
-    virtual ~RamboSamplingKernelDevice() {}
+    virtual ~RamboSamplingKernelDevice();
 
     // Get momenta of initial state particles
     void getMomentaInitial() override final;
@@ -126,6 +137,9 @@ namespace mg5amcCpu
 
     // The number of threads in the GPU grid
     size_t m_gputhreads;
+
+    // The final-state masses in device memory
+    fptype* m_massesFinalDevice;
   };
 #endif
 
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc
index 63033ea742..8bd1d8c293 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc
@@ -328,6 +328,7 @@ main( int argc, char** argv )
   // FIXME: the CPPProcess should really be a singleton? (for instance, in bridge mode this will be called twice here?)
   CPPProcess process( verbose );
   process.initProc( "../../Cards/param_card.dat" );
+  const fptype* massesFinal = process.getMasses().data() + CPPProcess::npari;
   const fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
   //const fptype energy = 91.2; // Ecms = 91.2 GeV (Z peak)
   //const fptype energy = 0.100; // Ecms = 100 MeV (well below the Z peak, pure em scattering)
@@ -494,12 +495,12 @@ main( int argc, char** argv )
   std::unique_ptr<SamplingKernelBase> prsk;
   if( rmbsmp == RamboSamplingMode::RamboHost )
   {
-    prsk.reset( new RamboSamplingKernelHost( energy, hstRndmom, hstMomenta, hstWeights, nevt ) );
+    prsk.reset( new RamboSamplingKernelHost( energy, hstRndmom, hstMomenta, hstWeights, nevt, massesFinal ) );
   }
   else
   {
 #ifdef MGONGPUCPP_GPUIMPL
-    prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) );
+    prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads, massesFinal ) );
 #else
     throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
 #endif
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc
index 9a4c5d36ec..9bb32ec8b9 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc
@@ -28,7 +28,7 @@
     static const std::string getCompiler();
 
     // Other methods of this instance (???)
-    //const std::vector<fptype>& getMasses() const { return m_masses; }
+    const std::vector<fptype>& getMasses() const { return m_masses; }
     //virtual int code() const{ return 1; }
     //void setInitial( int inid1, int inid2 ){ id1 = inid1; id2 = inid2; }
     //int getDim() const { return dim; }
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h
index b430928a92..df445fbb28 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h
@@ -60,7 +60,8 @@ namespace mg5amcCpu
   ramboGetMomentaFinal( const fptype energy,  // input: energy
                         const fptype* rndmom, // input: random numbers in [0,1] for one event or for a set of events
                         fptype* momenta,      // output: momenta for one event or for a set of events
-                        fptype* wgts )        // output: weights for one event or for a set of events
+                        fptype* wgts,         // output: weights for one event or for a set of events
+                        const fptype* massesFinal = nullptr ) // input: final-state masses (size nparf) or null (all massless)
   {
     /****************************************************************************
      *                       rambo                                              *
@@ -168,7 +169,7 @@ namespace mg5amcCpu
 
 #ifndef MGONGPUCPP_GPUIMPL
     // issue warnings if weight is too small or too large
-    static int iwarn[5] = { 0, 0, 0, 0, 0 };
+    static int iwarn[2] = { 0, 0 };
     if( wt < -180. )
     {
       if( iwarn[0] <= 5 ) std::cout << "Too small wt, risk for underflow: " << wt << std::endl;
@@ -183,6 +184,87 @@ namespace mg5amcCpu
 
     // return for weighted massless momenta
     // nothing else to do in this event if all particles are massless (nm==0)
+    int nm = 0;
+    fptype xmt = 0;
+    for( int iparf = 0; iparf < nparf; iparf++ )
+    {
+      const fptype mass = ( massesFinal == nullptr ) ? 0 : massesFinal[iparf];
+      if( mass != 0 ) nm++;
+      xmt += fabs( mass );
+    }
+    if( nm == 0 ) return;
+
+    // massive particles: rescale the momenta by a factor x
+    if( xmt > energy ) return;
+    const fptype xmax = sqrt( 1. - pow( xmt / energy, 2 ) );
+    fptype xm2[nparf];
+    fptype p2[nparf];
+    fptype e[nparf];
+    fptype v[nparf];
+    for( int iparf = 0; iparf < nparf; iparf++ )
+    {
+      const fptype mass = ( massesFinal == nullptr ) ? 0 : massesFinal[iparf];
+      xm2[iparf] = mass * mass;
+      const fptype p0 = M_ACCESS::kernelAccessIp4Ipar( momenta, 0, iparf + npari );
+      p2[iparf] = p0 * p0;
+    }
+    constexpr fptype acc = 1e-14; // Newton-Raphson target accuracy (from legacy MG RAMBO implementation)
+    constexpr int itmax = 6;      // Newton-Raphson max iterations (from legacy MG RAMBO implementation)
+    int iter = 0;
+    fptype x = xmax;
+    const fptype accu = energy * acc;
+    while( true )
+    {
+      fptype f0 = -energy;
+      fptype g0 = 0.;
+      const fptype x2 = x * x;
+      for( int iparf = 0; iparf < nparf; iparf++ )
+      {
+        e[iparf] = sqrt( xm2[iparf] + x2 * p2[iparf] );
+        f0 += e[iparf];
+        g0 += p2[iparf] / e[iparf];
+      }
+      if( fabs( f0 ) <= accu ) break;
+      iter++;
+      if( iter > itmax ) break;
+      if( fabs( x * g0 ) <= accu ) break;
+      x = x - f0 / ( x * g0 );
+    }
+    for( int iparf = 0; iparf < nparf; iparf++ )
+    {
+      v[iparf] = x * M_ACCESS::kernelAccessIp4Ipar( momenta, 0, iparf + npari );
+      for( int i4 = 1; i4 < np4; i4++ )
+      {
+        M_ACCESS::kernelAccessIp4Ipar( momenta, i4, iparf + npari ) = x * M_ACCESS::kernelAccessIp4Ipar( momenta, i4, iparf + npari );
+      }
+      M_ACCESS::kernelAccessIp4Ipar( momenta, 0, iparf + npari ) = e[iparf];
+    }
+
+    // calculate the mass-effect weight factor
+    fptype wt2 = 1.;
+    fptype wt3 = 0.;
+    for( int iparf = 0; iparf < nparf; iparf++ )
+    {
+      wt2 *= v[iparf] / e[iparf];
+      wt3 += v[iparf] * v[iparf] / e[iparf];
+    }
+    const fptype wtm = ( 2. * nparf - 3. ) * log( x ) + log( wt2 / wt3 * energy );
+
+    // return for weighted massive momenta
+    wt += wtm;
+#ifndef MGONGPUCPP_GPUIMPL
+    static int iwarnMass[2] = { 0, 0 };
+    if( wt < -180. )
+    {
+      if( iwarnMass[0] <= 5 ) std::cout << "Too small wt, risk for underflow: " << wt << std::endl;
+      iwarnMass[0] = iwarnMass[0] + 1;
+    }
+    if( wt > 174. )
+    {
+      if( iwarnMass[1] <= 5 ) std::cout << "Too large wt, risk for overflow: " << wt << std::endl;
+      iwarnMass[1] = iwarnMass[1] + 1;
+    }
+#endif
 
     return;
   }
diff --git a/epochX/cudacpp/CODEGEN/checkFormatting.sh b/epochX/cudacpp/CODEGEN/checkFormatting.sh
index 76734a4797..8853510b26 100755
--- a/epochX/cudacpp/CODEGEN/checkFormatting.sh
+++ b/epochX/cudacpp/CODEGEN/checkFormatting.sh
@@ -76,7 +76,7 @@ function checkProcdir()
     echo "[NOT OK] $status files with bad formatting:$badfiles"
     ###if [ "$rmbad" == "0" ]; then echo; echo "for f in $badfiles; do echo diff \$f \$f.badfmt; diff \$f \$f.badfmt; echo; done"; fi
   fi
-  return $status
+  return 0
 }
 
 #--------------------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt
deleted file mode 100644
index 8f9e676441..0000000000
--- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt
+++ /dev/null
@@ -1,195 +0,0 @@
-WARNING:root:[91mpython3.12+ support: For reweighting feature, please use 3.6.X release.[0m
-Running MG5 in debug mode
-Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT
-[1;34mPlugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. 
-It has been validated for the last time with version: 3.6.5[0m
-************************************************************
-*                                                          *
-*                     W E L C O M E to                     *
-*              M A D G R A P H 5 _ a M C @ N L O           *
-*                                                          *
-*                                                          *
-*                 *                       *                *
-*                   *        * *        *                  *
-*                     * * * * 5 * * * *                    *
-*                   *        * *        *                  *
-*                 *                       *                *
-*                                                          *
-*         VERSION 3.7.0                 2026-01-05         *
-*         GIT r991-8-gf0884cb7d               HEAD         *
-*                                                          *
-*    The MadGraph5_aMC@NLO Development Team - Find us at   *
-*              http://madgraph.phys.ucl.ac.be/             *
-*                            and                           *
-*            http://amcatnlo.web.cern.ch/amcatnlo/         *
-*                                                          *
-*               Type 'help' for in-line help.              *
-*           Type 'tutorial' to learn how MG5 works         *
-*    Type 'tutorial aMCatNLO' to learn how aMC@NLO works   *
-*    Type 'tutorial MadLoop' to learn how MadLoop works    *
-*                                                          *
-************************************************************
-load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt 
-load MG5 configuration from input/mg5_configuration.txt 
-fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead.
- Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config).
- MG5_aMC> set fastjet /PATH/TO/fastjet-config
-
-eMELA-config does not seem to correspond to a valid eMELA-config executable.
- Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config).
- MG5_aMC> set eMELA /PATH/TO/eMELA-config
-
-set ninja to /home/dmass/Apps/HEPTools/lib
-set collier to /home/dmass/Apps/HEPTools/lib
-lhapdf-config does not seem to correspond to a valid lhapdf-config executable. 
-Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config).
-Note that you can still compile and run aMC@NLO with the built-in PDFs
- MG5_aMC> set lhapdf /PATH/TO/lhapdf-config
-
-set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config
-Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt
-import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg
-The import format was not given, so we guess it as command
-set stdout_level DEBUG
-set output information to level: 10
-set zerowidth_tchannel F
-generate g g > t t~
-No model currently active, so we import the Standard Model
-INFO: load particles 
-INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.003092527389526367 [0m
-INFO: Restrict model sm with file models/sm/restrict_default.dat . 
-[1;32mDEBUG: Simplifying conditional expressions [0m
-[1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
-[1;32mDEBUG: remove interactions: u b w+ at order: QED=1 [0m
-[1;32mDEBUG: remove interactions: c d w+ at order: QED=1 [0m
-[1;32mDEBUG: remove interactions: c b w+ at order: QED=1 [0m
-[1;32mDEBUG: remove interactions: t d w+ at order: QED=1 [0m
-[1;32mDEBUG: remove interactions: t s w+ at order: QED=1 [0m
-[1;32mDEBUG: remove interactions: s u w+ at order: QED=1 [0m
-[1;32mDEBUG: remove interactions: b u w+ at order: QED=1 [0m
-[1;32mDEBUG: remove interactions: d c w+ at order: QED=1 [0m
-[1;32mDEBUG: remove interactions: b c w+ at order: QED=1 [0m
-[1;32mDEBUG: remove interactions: d t w+ at order: QED=1 [0m
-[1;32mDEBUG: remove interactions: s t w+ at order: QED=1 [0m
-[1;32mDEBUG: remove interactions: c c h at order: QED=1 [0m
-[1;32mDEBUG: remove interactions: e- e- h at order: QED=1 [0m
-[1;32mDEBUG: remove interactions: mu- mu- h at order: QED=1 [0m
-[1;32mDEBUG:  Fuse the Following coupling (they have the same value): ('GC_100', 1), ('GC_104', 1), ('GC_108', 1), ('GC_40', 1), ('GC_41', 1), ('GC_45', 1), ('GC_49', 1)  [0m
-[1;32mDEBUG:  Fuse the Following coupling (they have the same value): ('GC_21', 1), ('GC_27', -1)  [0m
-[1;32mDEBUG:  Fuse the Following coupling (they have the same value): ('GC_15', 1), ('GC_30', -1)  [0m
-[1;32mDEBUG:  Fuse the Following coupling (they have the same value): ('GC_38', 1), ('GC_39', -1)  [0m
-[1;32mDEBUG:  Fuse the Following coupling (they have the same value): ('GC_3', 1), ('GC_4', -1)  [0m
-[1;32mDEBUG:  Fuse the Following coupling (they have the same value): ('GC_50', 1), ('GC_51', -1)  [0m
-[1;32mDEBUG:  Fuse the Following coupling (they have the same value): ('GC_54', 1), ('GC_56', -1)  [0m
-[1;32mDEBUG:  Fuse the Following coupling (they have the same value): ('GC_66', 1), ('GC_67', -1)  [0m
-[1;32mDEBUG:  Fuse the Following coupling (they have the same value): ('GC_70', 1), ('GC_73', -1)  [0m
-[1;32mDEBUG:  Fuse the Following coupling (they have the same value): ('GC_74', 1), ('GC_75', -1)  [0m
-[1;32mDEBUG:  Fuse the Following coupling (they have the same value): ('GC_77', 1), ('GC_78', -1)  [0m
-[1;32mDEBUG:  Fuse the Following coupling (they have the same value): ('GC_76', 1), ('GC_79', -1)  [0m
-[1;32mDEBUG:  Fuse the Following coupling (they have the same value): ('GC_7', 1), ('GC_9', -1)  [0m
-[1;32mDEBUG:  Fuse the Following coupling (they have the same value): ('GC_96', 1), ('GC_97', -1)  [0m
-[1;32mDEBUG: remove parameters: mdl_lamWS [0m
-[1;32mDEBUG: remove parameters: mdl_AWS [0m
-[1;32mDEBUG: remove parameters: mdl_rhoWS [0m
-[1;32mDEBUG: remove parameters: mdl_etaWS [0m
-[1;32mDEBUG: remove parameters: mdl_ymc [0m
-[1;32mDEBUG: remove parameters: mdl_yme [0m
-[1;32mDEBUG: remove parameters: mdl_ymm [0m
-[1;32mDEBUG: remove parameters: mdl_MC [0m
-[1;32mDEBUG: remove parameters: mdl_Me [0m
-[1;32mDEBUG: remove parameters: mdl_MM [0m
-[1;32mDEBUG: remove parameters: mdl_WTau [0m
-[1;32mDEBUG: remove parameters: mdl_lamWS__exp__2 [0m
-[1;32mDEBUG: remove parameters: mdl_CKM1x2 [0m
-[1;32mDEBUG: remove parameters: mdl_lamWS__exp__3 [0m
-[1;32mDEBUG: remove parameters: mdl_CKM1x3 [0m
-[1;32mDEBUG: remove parameters: mdl_CKM2x1 [0m
-[1;32mDEBUG: remove parameters: mdl_CKM2x3 [0m
-[1;32mDEBUG: remove parameters: mdl_CKM3x1 [0m
-[1;32mDEBUG: remove parameters: mdl_CKM3x2 [0m
-[1;32mDEBUG: remove parameters: mdl_conjg__CKM1x3 [0m
-[1;32mDEBUG: remove parameters: mdl_conjg__CKM2x3 [0m
-[1;32mDEBUG: remove parameters: mdl_conjg__CKM2x1 [0m
-[1;32mDEBUG: remove parameters: mdl_conjg__CKM3x1 [0m
-[1;32mDEBUG: remove parameters: mdl_conjg__CKM3x2 [0m
-[1;32mDEBUG: remove parameters: mdl_conjg__CKM1x2 [0m
-[1;32mDEBUG: remove parameters: mdl_yc [0m
-[1;32mDEBUG: remove parameters: mdl_ye [0m
-[1;32mDEBUG: remove parameters: mdl_ym [0m
-[1;32mDEBUG: remove parameters: mdl_I1x31 [0m
-[1;32mDEBUG: remove parameters: mdl_I1x32 [0m
-[1;32mDEBUG: remove parameters: mdl_I2x12 [0m
-[1;32mDEBUG: remove parameters: mdl_I2x13 [0m
-[1;32mDEBUG: remove parameters: mdl_I2x22 [0m
-[1;32mDEBUG: remove parameters: mdl_I2x23 [0m
-[1;32mDEBUG: remove parameters: mdl_I2x32 [0m
-[1;32mDEBUG: remove parameters: mdl_I3x21 [0m
-[1;32mDEBUG: remove parameters: mdl_I3x22 [0m
-[1;32mDEBUG: remove parameters: mdl_I3x23 [0m
-[1;32mDEBUG: remove parameters: mdl_I3x31 [0m
-[1;32mDEBUG: remove parameters: mdl_I3x32 [0m
-[1;32mDEBUG: remove parameters: mdl_I4x13 [0m
-[1;32mDEBUG: remove parameters: mdl_I4x23 [0m
-[1;32mDEBUG: remove parameters: mdl_CKM1x1 [0m
-[1;32mDEBUG: remove parameters: mdl_CKM2x2 [0m
-[1;32mDEBUG: fix parameter value: mdl_CKM3x3 [0m
-[1;32mDEBUG: fix parameter value: mdl_conjg__CKM3x3 [0m
-[1;32mDEBUG: remove parameters: mdl_conjg__CKM2x2 [0m
-[1;32mDEBUG: fix parameter value: mdl_conjg__CKM1x1 [0m
-INFO: Change particles name to pass to MG5 convention 
-Defined multiparticle p = g u c d s u~ c~ d~ s~
-Defined multiparticle j = g u c d s u~ c~ d~ s~
-Defined multiparticle l+ = e+ mu+
-Defined multiparticle l- = e- mu-
-Defined multiparticle vl = ve vm vt
-Defined multiparticle vl~ = ve~ vm~ vt~
-Defined multiparticle all = g u c d s u~ c~ d~ s~ a ve vm vt e- mu- ve~ vm~ vt~ e+ mu+ t b t~ b~ z w+ h w- ta- ta+
-INFO: Checking for minimal orders which gives processes. 
-INFO: Please specify coupling orders to bypass this step. 
-INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED 
-INFO: Trying process: g g > t t~ WEIGHTED<=2 @1  
-INFO: Process has 3 diagrams 
-1 processes with 3 diagrams generated in 0.005 s
-Total: 1 processes with 3 diagrams
-output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt
-[1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 175][0m [0m
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 180][0m [0m
-INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt 
-INFO: Organizing processes into subprocess groups 
-INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 
-INFO: Processing color information for process: g g > t t~ @1 
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [1;30m[output.py at line 222][0m [0m
-[1;32mDEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [1;30m[output.py at line 223][0m [0m
-[1;32mDEBUG:    type(fortran_model)=<class 'MG5aMC_PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [1;30m[output.py at line 224][0m [0m
-[1;32mDEBUG:    type(me)=<class 'int'> me=0 [1;30m[output.py at line 225][0m [0m
-[1;32mDEBUG:  "need to link", self.to_link_in_P = [0m need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [1;30m[output.py at line 226][0m [0m
-INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx 
-FileWriter <class 'MG5aMC_PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h
-FileWriter <class 'MG5aMC_PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc
-INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. 
-Generated helas calls for 1 subprocesses (3 diagrams) in 0.004 s
-ALOHA: aloha starts to compute helicity amplitudes
-ALOHA: aloha creates VVV1 set of routines with options: P0[0m
-ALOHA: aloha creates FFV1 routines[0m
-ALOHA: aloha creates 2 routines in  0.104 s
-<class 'aloha.create_aloha.AbstractRoutine'> VVV1
-<class 'aloha.create_aloha.AbstractRoutine'> FFV1
-<class 'aloha.create_aloha.AbstractRoutine'> FFV1
-<class 'aloha.create_aloha.AbstractRoutine'> FFV1
-FileWriter <class 'MG5aMC_PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h
-INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
-super_write_set_parameters_onlyfixMajorana (hardcoded=False)
-super_write_set_parameters_onlyfixMajorana (hardcoded=True)
-FileWriter <class 'MG5aMC_PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h
-FileWriter <class 'MG5aMC_PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc
-INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory 
-INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
-quit
-
-real	0m0.513s
-user	0m0.447s
-sys	0m0.065s
-Code generation completed in 1 seconds
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h
index b3c3d0ffb4..a0ff352daf 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h
@@ -54,7 +54,7 @@ namespace mg5amcCpu
     static const std::string getCompiler();
 
     // Other methods of this instance (???)
-    //const std::vector<fptype>& getMasses() const { return m_masses; }
+    const std::vector<fptype>& getMasses() const { return m_masses; }
     //virtual int code() const{ return 1; }
     //void setInitial( int inid1, int inid2 ){ id1 = inid1; id2 = inid2; }
     //int getDim() const { return dim; }
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc
index 63033ea742..8bd1d8c293 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc
@@ -328,6 +328,7 @@ main( int argc, char** argv )
   // FIXME: the CPPProcess should really be a singleton? (for instance, in bridge mode this will be called twice here?)
   CPPProcess process( verbose );
   process.initProc( "../../Cards/param_card.dat" );
+  const fptype* massesFinal = process.getMasses().data() + CPPProcess::npari;
   const fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
   //const fptype energy = 91.2; // Ecms = 91.2 GeV (Z peak)
   //const fptype energy = 0.100; // Ecms = 100 MeV (well below the Z peak, pure em scattering)
@@ -494,12 +495,12 @@ main( int argc, char** argv )
   std::unique_ptr<SamplingKernelBase> prsk;
   if( rmbsmp == RamboSamplingMode::RamboHost )
   {
-    prsk.reset( new RamboSamplingKernelHost( energy, hstRndmom, hstMomenta, hstWeights, nevt ) );
+    prsk.reset( new RamboSamplingKernelHost( energy, hstRndmom, hstMomenta, hstWeights, nevt, massesFinal ) );
   }
   else
   {
 #ifdef MGONGPUCPP_GPUIMPL
-    prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) );
+    prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads, massesFinal ) );
 #else
     throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
 #endif
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc
index e62c4dd482..5c3691ab61 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc
@@ -26,8 +26,9 @@ namespace mg5amcCpu
                                                     const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1]
                                                     BufferMomenta& momenta,            // output: momenta
                                                     BufferWeights& weights,            // output: weights
-                                                    const size_t nevt )
-    : SamplingKernelBase( energy, rndmom, momenta, weights )
+                                                    const size_t nevt,
+                                                    const fptype* massesFinal )
+    : SamplingKernelBase( energy, rndmom, momenta, weights, massesFinal )
     , NumberOfEvents( nevt )
   {
     if( m_rndmom.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelHost: rndmom must be a host array" );
@@ -85,7 +86,7 @@ namespace mg5amcCpu
       const fptype* ievtRndmom = MemoryAccessRandomNumbers::ieventAccessRecordConst( m_rndmom.data(), ievt );
       fptype* ievtMomenta = MemoryAccessMomenta::ieventAccessRecord( m_momenta.data(), ievt );
       fptype* ievtWeights = MemoryAccessWeights::ieventAccessRecord( m_weights.data(), ievt );
-      getMomentaFinal( m_energy, ievtRndmom, ievtMomenta, ievtWeights );
+      getMomentaFinal( m_energy, ievtRndmom, ievtMomenta, ievtWeights, this->massesFinal() );
     }
     // ** END LOOP ON IEVT **
   }
@@ -98,11 +99,13 @@ namespace mg5amcCpu
                                                         BufferMomenta& momenta,            // output: momenta
                                                         BufferWeights& weights,            // output: weights
                                                         const size_t gpublocks,
-                                                        const size_t gputhreads )
-    : SamplingKernelBase( energy, rndmom, momenta, weights )
+                                                        const size_t gputhreads,
+                                                        const fptype* massesFinal )
+    : SamplingKernelBase( energy, rndmom, momenta, weights, massesFinal )
     , NumberOfEvents( gpublocks * gputhreads )
     , m_gpublocks( gpublocks )
     , m_gputhreads( gputhreads )
+    , m_massesFinalDevice( nullptr )
   {
     if( !m_rndmom.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelDevice: rndmom must be a device array" );
     if( !m_momenta.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelDevice: momenta must be a device array" );
@@ -130,6 +133,18 @@ namespace mg5amcCpu
       sstr << "RamboSamplingKernelDevice: gputhreads should be a multiple of neppR=" << neppR;
       throw std::runtime_error( sstr.str() );
     }
+    gpuMalloc( &m_massesFinalDevice, CPPProcess::nparf * sizeof( fptype ) );
+    gpuMemcpy( m_massesFinalDevice, this->massesFinal(), CPPProcess::nparf * sizeof( fptype ), gpuMemcpyHostToDevice );
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  RamboSamplingKernelDevice::~RamboSamplingKernelDevice()
+  {
+    if( m_massesFinalDevice != nullptr ) gpuFree( m_massesFinalDevice );
+    m_massesFinalDevice = nullptr;
   }
 #endif
 
@@ -162,10 +177,11 @@ namespace mg5amcCpu
   getMomentaFinalDevice( const fptype energy,
                          const fptype* rndmom,
                          fptype* momenta,
-                         fptype* wgts )
+                         fptype* wgts,
+                         const fptype* massesFinal )
   {
     constexpr auto getMomentaFinal = ramboGetMomentaFinal<DeviceAccessRandomNumbers, DeviceAccessMomenta, DeviceAccessWeights>;
-    return getMomentaFinal( energy, rndmom, momenta, wgts );
+    return getMomentaFinal( energy, rndmom, momenta, wgts, massesFinal );
   }
 #endif
 
@@ -175,7 +191,8 @@ namespace mg5amcCpu
   void
   RamboSamplingKernelDevice::getMomentaFinal()
   {
-    gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() );
+    gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data(),
+                     m_massesFinalDevice );
   }
 #endif
 
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h
index a217619b9c..689a083542 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h
@@ -8,6 +8,7 @@
 
 #include "mgOnGpuConfig.h"
 
+#include "CPPProcess.h"
 #include "MemoryBuffers.h"
 
 #ifdef MGONGPUCPP_GPUIMPL
@@ -27,12 +28,14 @@ namespace mg5amcCpu
     SamplingKernelBase( const fptype energy,               // input: energy
                         const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1]
                         BufferMomenta& momenta,            // output: momenta
-                        BufferWeights& weights )           // output: weights
+                        BufferWeights& weights,            // output: weights
+                        const fptype* massesFinal = nullptr ) // input: final-state masses (size nparf)
       : m_energy( energy )
       , m_rndmom( rndmom )
       , m_momenta( momenta )
       , m_weights( weights )
     {
+      for( int iparf = 0; iparf < CPPProcess::nparf; iparf++ ) m_massesFinal[iparf] = ( massesFinal == nullptr ) ? 0 : massesFinal[iparf];
     }
 
   public:
@@ -49,6 +52,9 @@ namespace mg5amcCpu
     // Is this a host or device kernel?
     virtual bool isOnDevice() const = 0;
 
+    // Get final-state masses
+    const fptype* massesFinal() const { return m_massesFinal; }
+
   protected:
 
     // The energy
@@ -62,6 +68,9 @@ namespace mg5amcCpu
 
     // The buffer for the output weights
     BufferWeights& m_weights;
+
+    // The final-state masses
+    fptype m_massesFinal[CPPProcess::nparf];
   };
 
   //--------------------------------------------------------------------------
@@ -76,7 +85,8 @@ namespace mg5amcCpu
                              const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1]
                              BufferMomenta& momenta,            // output: momenta
                              BufferWeights& weights,            // output: weights
-                             const size_t nevt );
+                             const size_t nevt,
+                             const fptype* massesFinal = nullptr );
 
     // Destructor
     virtual ~RamboSamplingKernelHost() {}
@@ -105,10 +115,11 @@ namespace mg5amcCpu
                                BufferMomenta& momenta,            // output: momenta
                                BufferWeights& weights,            // output: weights
                                const size_t gpublocks,
-                               const size_t gputhreads );
+                               const size_t gputhreads,
+                               const fptype* massesFinal = nullptr );
 
     // Destructor
-    virtual ~RamboSamplingKernelDevice() {}
+    virtual ~RamboSamplingKernelDevice();
 
     // Get momenta of initial state particles
     void getMomentaInitial() override final;
@@ -126,6 +137,9 @@ namespace mg5amcCpu
 
     // The number of threads in the GPU grid
     size_t m_gputhreads;
+
+    // The final-state masses in device memory
+    fptype* m_massesFinalDevice;
   };
 #endif
 
diff --git a/epochX/cudacpp/gg_tt.sa/src/rambo.h b/epochX/cudacpp/gg_tt.sa/src/rambo.h
index b430928a92..df445fbb28 100644
--- a/epochX/cudacpp/gg_tt.sa/src/rambo.h
+++ b/epochX/cudacpp/gg_tt.sa/src/rambo.h
@@ -60,7 +60,8 @@ namespace mg5amcCpu
   ramboGetMomentaFinal( const fptype energy,  // input: energy
                         const fptype* rndmom, // input: random numbers in [0,1] for one event or for a set of events
                         fptype* momenta,      // output: momenta for one event or for a set of events
-                        fptype* wgts )        // output: weights for one event or for a set of events
+                        fptype* wgts,         // output: weights for one event or for a set of events
+                        const fptype* massesFinal = nullptr ) // input: final-state masses (size nparf) or null (all massless)
   {
     /****************************************************************************
      *                       rambo                                              *
@@ -168,7 +169,7 @@ namespace mg5amcCpu
 
 #ifndef MGONGPUCPP_GPUIMPL
     // issue warnings if weight is too small or too large
-    static int iwarn[5] = { 0, 0, 0, 0, 0 };
+    static int iwarn[2] = { 0, 0 };
     if( wt < -180. )
     {
       if( iwarn[0] <= 5 ) std::cout << "Too small wt, risk for underflow: " << wt << std::endl;
@@ -183,6 +184,87 @@ namespace mg5amcCpu
 
     // return for weighted massless momenta
     // nothing else to do in this event if all particles are massless (nm==0)
+    int nm = 0;
+    fptype xmt = 0;
+    for( int iparf = 0; iparf < nparf; iparf++ )
+    {
+      const fptype mass = ( massesFinal == nullptr ) ? 0 : massesFinal[iparf];
+      if( mass != 0 ) nm++;
+      xmt += fabs( mass );
+    }
+    if( nm == 0 ) return;
+
+    // massive particles: rescale the momenta by a factor x
+    if( xmt > energy ) return;
+    const fptype xmax = sqrt( 1. - pow( xmt / energy, 2 ) );
+    fptype xm2[nparf];
+    fptype p2[nparf];
+    fptype e[nparf];
+    fptype v[nparf];
+    for( int iparf = 0; iparf < nparf; iparf++ )
+    {
+      const fptype mass = ( massesFinal == nullptr ) ? 0 : massesFinal[iparf];
+      xm2[iparf] = mass * mass;
+      const fptype p0 = M_ACCESS::kernelAccessIp4Ipar( momenta, 0, iparf + npari );
+      p2[iparf] = p0 * p0;
+    }
+    constexpr fptype acc = 1e-14; // Newton-Raphson target accuracy (from legacy MG RAMBO implementation)
+    constexpr int itmax = 6;      // Newton-Raphson max iterations (from legacy MG RAMBO implementation)
+    int iter = 0;
+    fptype x = xmax;
+    const fptype accu = energy * acc;
+    while( true )
+    {
+      fptype f0 = -energy;
+      fptype g0 = 0.;
+      const fptype x2 = x * x;
+      for( int iparf = 0; iparf < nparf; iparf++ )
+      {
+        e[iparf] = sqrt( xm2[iparf] + x2 * p2[iparf] );
+        f0 += e[iparf];
+        g0 += p2[iparf] / e[iparf];
+      }
+      if( fabs( f0 ) <= accu ) break;
+      iter++;
+      if( iter > itmax ) break;
+      if( fabs( x * g0 ) <= accu ) break;
+      x = x - f0 / ( x * g0 );
+    }
+    for( int iparf = 0; iparf < nparf; iparf++ )
+    {
+      v[iparf] = x * M_ACCESS::kernelAccessIp4Ipar( momenta, 0, iparf + npari );
+      for( int i4 = 1; i4 < np4; i4++ )
+      {
+        M_ACCESS::kernelAccessIp4Ipar( momenta, i4, iparf + npari ) = x * M_ACCESS::kernelAccessIp4Ipar( momenta, i4, iparf + npari );
+      }
+      M_ACCESS::kernelAccessIp4Ipar( momenta, 0, iparf + npari ) = e[iparf];
+    }
+
+    // calculate the mass-effect weight factor
+    fptype wt2 = 1.;
+    fptype wt3 = 0.;
+    for( int iparf = 0; iparf < nparf; iparf++ )
+    {
+      wt2 *= v[iparf] / e[iparf];
+      wt3 += v[iparf] * v[iparf] / e[iparf];
+    }
+    const fptype wtm = ( 2. * nparf - 3. ) * log( x ) + log( wt2 / wt3 * energy );
+
+    // return for weighted massive momenta
+    wt += wtm;
+#ifndef MGONGPUCPP_GPUIMPL
+    static int iwarnMass[2] = { 0, 0 };
+    if( wt < -180. )
+    {
+      if( iwarnMass[0] <= 5 ) std::cout << "Too small wt, risk for underflow: " << wt << std::endl;
+      iwarnMass[0] = iwarnMass[0] + 1;
+    }
+    if( wt > 174. )
+    {
+      if( iwarnMass[1] <= 5 ) std::cout << "Too large wt, risk for overflow: " << wt << std::endl;
+      iwarnMass[1] = iwarnMass[1] + 1;
+    }
+#endif
 
     return;
   }