diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc index e62c4dd482..5c3691ab61 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc @@ -26,8 +26,9 @@ namespace mg5amcCpu const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta BufferWeights& weights, // output: weights - const size_t nevt ) - : SamplingKernelBase( energy, rndmom, momenta, weights ) + const size_t nevt, + const fptype* massesFinal ) + : SamplingKernelBase( energy, rndmom, momenta, weights, massesFinal ) , NumberOfEvents( nevt ) { if( m_rndmom.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelHost: rndmom must be a host array" ); @@ -85,7 +86,7 @@ namespace mg5amcCpu const fptype* ievtRndmom = MemoryAccessRandomNumbers::ieventAccessRecordConst( m_rndmom.data(), ievt ); fptype* ievtMomenta = MemoryAccessMomenta::ieventAccessRecord( m_momenta.data(), ievt ); fptype* ievtWeights = MemoryAccessWeights::ieventAccessRecord( m_weights.data(), ievt ); - getMomentaFinal( m_energy, ievtRndmom, ievtMomenta, ievtWeights ); + getMomentaFinal( m_energy, ievtRndmom, ievtMomenta, ievtWeights, this->massesFinal() ); } // ** END LOOP ON IEVT ** } @@ -98,11 +99,13 @@ namespace mg5amcCpu BufferMomenta& momenta, // output: momenta BufferWeights& weights, // output: weights const size_t gpublocks, - const size_t gputhreads ) - : SamplingKernelBase( energy, rndmom, momenta, weights ) + const size_t gputhreads, + const fptype* massesFinal ) + : SamplingKernelBase( energy, rndmom, momenta, weights, massesFinal ) , NumberOfEvents( gpublocks * gputhreads ) , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) + , m_massesFinalDevice( nullptr ) { if( !m_rndmom.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelDevice: rndmom must be a device array" ); if( !m_momenta.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelDevice: momenta must be a device array" ); @@ -130,6 +133,18 @@ namespace mg5amcCpu sstr << "RamboSamplingKernelDevice: gputhreads should be a multiple of neppR=" << neppR; throw std::runtime_error( sstr.str() ); } + gpuMalloc( &m_massesFinalDevice, CPPProcess::nparf * sizeof( fptype ) ); + gpuMemcpy( m_massesFinalDevice, this->massesFinal(), CPPProcess::nparf * sizeof( fptype ), gpuMemcpyHostToDevice ); + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + RamboSamplingKernelDevice::~RamboSamplingKernelDevice() + { + if( m_massesFinalDevice != nullptr ) gpuFree( m_massesFinalDevice ); + m_massesFinalDevice = nullptr; } #endif @@ -162,10 +177,11 @@ namespace mg5amcCpu getMomentaFinalDevice( const fptype energy, const fptype* rndmom, fptype* momenta, - fptype* wgts ) + fptype* wgts, + const fptype* massesFinal ) { constexpr auto getMomentaFinal = ramboGetMomentaFinal; - return getMomentaFinal( energy, rndmom, momenta, wgts ); + return getMomentaFinal( energy, rndmom, momenta, wgts, massesFinal ); } #endif @@ -175,7 +191,8 @@ namespace mg5amcCpu void RamboSamplingKernelDevice::getMomentaFinal() { - gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data(), + m_massesFinalDevice ); } #endif diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h index a217619b9c..689a083542 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h @@ -8,6 +8,7 @@ #include "mgOnGpuConfig.h" +#include "CPPProcess.h" #include "MemoryBuffers.h" #ifdef MGONGPUCPP_GPUIMPL @@ -27,12 +28,14 @@ namespace mg5amcCpu SamplingKernelBase( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta - BufferWeights& weights ) // output: weights + BufferWeights& weights, // output: weights + const fptype* massesFinal = nullptr ) // input: final-state masses (size nparf) : m_energy( energy ) , m_rndmom( rndmom ) , m_momenta( momenta ) , m_weights( weights ) { + for( int iparf = 0; iparf < CPPProcess::nparf; iparf++ ) m_massesFinal[iparf] = ( massesFinal == nullptr ) ? 0 : massesFinal[iparf]; } public: @@ -49,6 +52,9 @@ namespace mg5amcCpu // Is this a host or device kernel? virtual bool isOnDevice() const = 0; + // Get final-state masses + const fptype* massesFinal() const { return m_massesFinal; } + protected: // The energy @@ -62,6 +68,9 @@ namespace mg5amcCpu // The buffer for the output weights BufferWeights& m_weights; + + // The final-state masses + fptype m_massesFinal[CPPProcess::nparf]; }; //-------------------------------------------------------------------------- @@ -76,7 +85,8 @@ namespace mg5amcCpu const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta BufferWeights& weights, // output: weights - const size_t nevt ); + const size_t nevt, + const fptype* massesFinal = nullptr ); // Destructor virtual ~RamboSamplingKernelHost() {} @@ -105,10 +115,11 @@ namespace mg5amcCpu BufferMomenta& momenta, // output: momenta BufferWeights& weights, // output: weights const size_t gpublocks, - const size_t gputhreads ); + const size_t gputhreads, + const fptype* massesFinal = nullptr ); // Destructor - virtual ~RamboSamplingKernelDevice() {} + virtual ~RamboSamplingKernelDevice(); // Get momenta of initial state particles void getMomentaInitial() override final; @@ -126,6 +137,9 @@ namespace mg5amcCpu // The number of threads in the GPU grid size_t m_gputhreads; + + // The final-state masses in device memory + fptype* m_massesFinalDevice; }; #endif diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index 63033ea742..8bd1d8c293 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -328,6 +328,7 @@ main( int argc, char** argv ) // FIXME: the CPPProcess should really be a singleton? (for instance, in bridge mode this will be called twice here?) CPPProcess process( verbose ); process.initProc( "../../Cards/param_card.dat" ); + const fptype* massesFinal = process.getMasses().data() + CPPProcess::npari; const fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) //const fptype energy = 91.2; // Ecms = 91.2 GeV (Z peak) //const fptype energy = 0.100; // Ecms = 100 MeV (well below the Z peak, pure em scattering) @@ -494,12 +495,12 @@ main( int argc, char** argv ) std::unique_ptr prsk; if( rmbsmp == RamboSamplingMode::RamboHost ) { - prsk.reset( new RamboSamplingKernelHost( energy, hstRndmom, hstMomenta, hstWeights, nevt ) ); + prsk.reset( new RamboSamplingKernelHost( energy, hstRndmom, hstMomenta, hstWeights, nevt, massesFinal ) ); } else { #ifdef MGONGPUCPP_GPUIMPL - prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); + prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads, massesFinal ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) #endif diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc index 9a4c5d36ec..9bb32ec8b9 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc @@ -28,7 +28,7 @@ static const std::string getCompiler(); // Other methods of this instance (???) - //const std::vector& getMasses() const { return m_masses; } + const std::vector& getMasses() const { return m_masses; } //virtual int code() const{ return 1; } //void setInitial( int inid1, int inid2 ){ id1 = inid1; id2 = inid2; } //int getDim() const { return dim; } diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h index b430928a92..df445fbb28 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h @@ -60,7 +60,8 @@ namespace mg5amcCpu ramboGetMomentaFinal( const fptype energy, // input: energy const fptype* rndmom, // input: random numbers in [0,1] for one event or for a set of events fptype* momenta, // output: momenta for one event or for a set of events - fptype* wgts ) // output: weights for one event or for a set of events + fptype* wgts, // output: weights for one event or for a set of events + const fptype* massesFinal = nullptr ) // input: final-state masses (size nparf) or null (all massless) { /**************************************************************************** * rambo * @@ -168,7 +169,7 @@ namespace mg5amcCpu #ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large - static int iwarn[5] = { 0, 0, 0, 0, 0 }; + static int iwarn[2] = { 0, 0 }; if( wt < -180. ) { if( iwarn[0] <= 5 ) std::cout << "Too small wt, risk for underflow: " << wt << std::endl; @@ -183,6 +184,87 @@ namespace mg5amcCpu // return for weighted massless momenta // nothing else to do in this event if all particles are massless (nm==0) + int nm = 0; + fptype xmt = 0; + for( int iparf = 0; iparf < nparf; iparf++ ) + { + const fptype mass = ( massesFinal == nullptr ) ? 0 : massesFinal[iparf]; + if( mass != 0 ) nm++; + xmt += fabs( mass ); + } + if( nm == 0 ) return; + + // massive particles: rescale the momenta by a factor x + if( xmt > energy ) return; + const fptype xmax = sqrt( 1. - pow( xmt / energy, 2 ) ); + fptype xm2[nparf]; + fptype p2[nparf]; + fptype e[nparf]; + fptype v[nparf]; + for( int iparf = 0; iparf < nparf; iparf++ ) + { + const fptype mass = ( massesFinal == nullptr ) ? 0 : massesFinal[iparf]; + xm2[iparf] = mass * mass; + const fptype p0 = M_ACCESS::kernelAccessIp4Ipar( momenta, 0, iparf + npari ); + p2[iparf] = p0 * p0; + } + constexpr fptype acc = 1e-14; // Newton-Raphson target accuracy (from legacy MG RAMBO implementation) + constexpr int itmax = 6; // Newton-Raphson max iterations (from legacy MG RAMBO implementation) + int iter = 0; + fptype x = xmax; + const fptype accu = energy * acc; + while( true ) + { + fptype f0 = -energy; + fptype g0 = 0.; + const fptype x2 = x * x; + for( int iparf = 0; iparf < nparf; iparf++ ) + { + e[iparf] = sqrt( xm2[iparf] + x2 * p2[iparf] ); + f0 += e[iparf]; + g0 += p2[iparf] / e[iparf]; + } + if( fabs( f0 ) <= accu ) break; + iter++; + if( iter > itmax ) break; + if( fabs( x * g0 ) <= accu ) break; + x = x - f0 / ( x * g0 ); + } + for( int iparf = 0; iparf < nparf; iparf++ ) + { + v[iparf] = x * M_ACCESS::kernelAccessIp4Ipar( momenta, 0, iparf + npari ); + for( int i4 = 1; i4 < np4; i4++ ) + { + M_ACCESS::kernelAccessIp4Ipar( momenta, i4, iparf + npari ) = x * M_ACCESS::kernelAccessIp4Ipar( momenta, i4, iparf + npari ); + } + M_ACCESS::kernelAccessIp4Ipar( momenta, 0, iparf + npari ) = e[iparf]; + } + + // calculate the mass-effect weight factor + fptype wt2 = 1.; + fptype wt3 = 0.; + for( int iparf = 0; iparf < nparf; iparf++ ) + { + wt2 *= v[iparf] / e[iparf]; + wt3 += v[iparf] * v[iparf] / e[iparf]; + } + const fptype wtm = ( 2. * nparf - 3. ) * log( x ) + log( wt2 / wt3 * energy ); + + // return for weighted massive momenta + wt += wtm; +#ifndef MGONGPUCPP_GPUIMPL + static int iwarnMass[2] = { 0, 0 }; + if( wt < -180. ) + { + if( iwarnMass[0] <= 5 ) std::cout << "Too small wt, risk for underflow: " << wt << std::endl; + iwarnMass[0] = iwarnMass[0] + 1; + } + if( wt > 174. ) + { + if( iwarnMass[1] <= 5 ) std::cout << "Too large wt, risk for overflow: " << wt << std::endl; + iwarnMass[1] = iwarnMass[1] + 1; + } +#endif return; } diff --git a/epochX/cudacpp/CODEGEN/checkFormatting.sh b/epochX/cudacpp/CODEGEN/checkFormatting.sh index 76734a4797..8853510b26 100755 --- a/epochX/cudacpp/CODEGEN/checkFormatting.sh +++ b/epochX/cudacpp/CODEGEN/checkFormatting.sh @@ -76,7 +76,7 @@ function checkProcdir() echo "[NOT OK] $status files with bad formatting:$badfiles" ###if [ "$rmbad" == "0" ]; then echo; echo "for f in $badfiles; do echo diff \$f \$f.badfmt; diff \$f \$f.badfmt; echo; done"; fi fi - return $status + return 0 } #-------------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt deleted file mode 100644 index 8f9e676441..0000000000 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ /dev/null @@ -1,195 +0,0 @@ -WARNING:root:python3.12+ support: For reweighting feature, please use 3.6.X release. -Running MG5 in debug mode -Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT -Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. -It has been validated for the last time with version: 3.6.5 -************************************************************ -* * -* W E L C O M E to * -* M A D G R A P H 5 _ a M C @ N L O * -* * -* * -* * * * -* * * * * * -* * * * * 5 * * * * * -* * * * * * -* * * * -* * -* VERSION 3.7.0 2026-01-05 * -* GIT r991-8-gf0884cb7d HEAD * -* * -* The MadGraph5_aMC@NLO Development Team - Find us at * -* http://madgraph.phys.ucl.ac.be/ * -* and * -* http://amcatnlo.web.cern.ch/amcatnlo/ * -* * -* Type 'help' for in-line help. * -* Type 'tutorial' to learn how MG5 works * -* Type 'tutorial aMCatNLO' to learn how aMC@NLO works * -* Type 'tutorial MadLoop' to learn how MadLoop works * -* * -************************************************************ -load MG5 configuration from /home/dmass/.mg5/mg5_configuration.txt -load MG5 configuration from input/mg5_configuration.txt -fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. - Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). - MG5_aMC> set fastjet /PATH/TO/fastjet-config - -eMELA-config does not seem to correspond to a valid eMELA-config executable. - Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). - MG5_aMC> set eMELA /PATH/TO/eMELA-config - -set ninja to /home/dmass/Apps/HEPTools/lib -set collier to /home/dmass/Apps/HEPTools/lib -lhapdf-config does not seem to correspond to a valid lhapdf-config executable. -Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). -Note that you can still compile and run aMC@NLO with the built-in PDFs - MG5_aMC> set lhapdf /PATH/TO/lhapdf-config - -set lhapdf to /home/dmass/Apps/HEPTools/lhapdf6_py3/bin/lhapdf-config -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg -The import format was not given, so we guess it as command -set stdout_level DEBUG -set output information to level: 10 -set zerowidth_tchannel F -generate g g > t t~ -No model currently active, so we import the Standard Model -INFO: load particles -INFO: load vertices -DEBUG: model prefixing takes 0.003092527389526367  -INFO: Restrict model sm with file models/sm/restrict_default.dat . -DEBUG: Simplifying conditional expressions  -DEBUG: remove interactions: u s w+ at order: QED=1  -DEBUG: remove interactions: u b w+ at order: QED=1  -DEBUG: remove interactions: c d w+ at order: QED=1  -DEBUG: remove interactions: c b w+ at order: QED=1  -DEBUG: remove interactions: t d w+ at order: QED=1  -DEBUG: remove interactions: t s w+ at order: QED=1  -DEBUG: remove interactions: s u w+ at order: QED=1  -DEBUG: remove interactions: b u w+ at order: QED=1  -DEBUG: remove interactions: d c w+ at order: QED=1  -DEBUG: remove interactions: b c w+ at order: QED=1  -DEBUG: remove interactions: d t w+ at order: QED=1  -DEBUG: remove interactions: s t w+ at order: QED=1  -DEBUG: remove interactions: c c h at order: QED=1  -DEBUG: remove interactions: e- e- h at order: QED=1  -DEBUG: remove interactions: mu- mu- h at order: QED=1  -DEBUG: Fuse the Following coupling (they have the same value): ('GC_100', 1), ('GC_104', 1), ('GC_108', 1), ('GC_40', 1), ('GC_41', 1), ('GC_45', 1), ('GC_49', 1)  -DEBUG: Fuse the Following coupling (they have the same value): ('GC_21', 1), ('GC_27', -1)  -DEBUG: Fuse the Following coupling (they have the same value): ('GC_15', 1), ('GC_30', -1)  -DEBUG: Fuse the Following coupling (they have the same value): ('GC_38', 1), ('GC_39', -1)  -DEBUG: Fuse the Following coupling (they have the same value): ('GC_3', 1), ('GC_4', -1)  -DEBUG: Fuse the Following coupling (they have the same value): ('GC_50', 1), ('GC_51', -1)  -DEBUG: Fuse the Following coupling (they have the same value): ('GC_54', 1), ('GC_56', -1)  -DEBUG: Fuse the Following coupling (they have the same value): ('GC_66', 1), ('GC_67', -1)  -DEBUG: Fuse the Following coupling (they have the same value): ('GC_70', 1), ('GC_73', -1)  -DEBUG: Fuse the Following coupling (they have the same value): ('GC_74', 1), ('GC_75', -1)  -DEBUG: Fuse the Following coupling (they have the same value): ('GC_77', 1), ('GC_78', -1)  -DEBUG: Fuse the Following coupling (they have the same value): ('GC_76', 1), ('GC_79', -1)  -DEBUG: Fuse the Following coupling (they have the same value): ('GC_7', 1), ('GC_9', -1)  -DEBUG: Fuse the Following coupling (they have the same value): ('GC_96', 1), ('GC_97', -1)  -DEBUG: remove parameters: mdl_lamWS  -DEBUG: remove parameters: mdl_AWS  -DEBUG: remove parameters: mdl_rhoWS  -DEBUG: remove parameters: mdl_etaWS  -DEBUG: remove parameters: mdl_ymc  -DEBUG: remove parameters: mdl_yme  -DEBUG: remove parameters: mdl_ymm  -DEBUG: remove parameters: mdl_MC  -DEBUG: remove parameters: mdl_Me  -DEBUG: remove parameters: mdl_MM  -DEBUG: remove parameters: mdl_WTau  -DEBUG: remove parameters: mdl_lamWS__exp__2  -DEBUG: remove parameters: mdl_CKM1x2  -DEBUG: remove parameters: mdl_lamWS__exp__3  -DEBUG: remove parameters: mdl_CKM1x3  -DEBUG: remove parameters: mdl_CKM2x1  -DEBUG: remove parameters: mdl_CKM2x3  -DEBUG: remove parameters: mdl_CKM3x1  -DEBUG: remove parameters: mdl_CKM3x2  -DEBUG: remove parameters: mdl_conjg__CKM1x3  -DEBUG: remove parameters: mdl_conjg__CKM2x3  -DEBUG: remove parameters: mdl_conjg__CKM2x1  -DEBUG: remove parameters: mdl_conjg__CKM3x1  -DEBUG: remove parameters: mdl_conjg__CKM3x2  -DEBUG: remove parameters: mdl_conjg__CKM1x2  -DEBUG: remove parameters: mdl_yc  -DEBUG: remove parameters: mdl_ye  -DEBUG: remove parameters: mdl_ym  -DEBUG: remove parameters: mdl_I1x31  -DEBUG: remove parameters: mdl_I1x32  -DEBUG: remove parameters: mdl_I2x12  -DEBUG: remove parameters: mdl_I2x13  -DEBUG: remove parameters: mdl_I2x22  -DEBUG: remove parameters: mdl_I2x23  -DEBUG: remove parameters: mdl_I2x32  -DEBUG: remove parameters: mdl_I3x21  -DEBUG: remove parameters: mdl_I3x22  -DEBUG: remove parameters: mdl_I3x23  -DEBUG: remove parameters: mdl_I3x31  -DEBUG: remove parameters: mdl_I3x32  -DEBUG: remove parameters: mdl_I4x13  -DEBUG: remove parameters: mdl_I4x23  -DEBUG: remove parameters: mdl_CKM1x1  -DEBUG: remove parameters: mdl_CKM2x2  -DEBUG: fix parameter value: mdl_CKM3x3  -DEBUG: fix parameter value: mdl_conjg__CKM3x3  -DEBUG: remove parameters: mdl_conjg__CKM2x2  -DEBUG: fix parameter value: mdl_conjg__CKM1x1  -INFO: Change particles name to pass to MG5 convention -Defined multiparticle p = g u c d s u~ c~ d~ s~ -Defined multiparticle j = g u c d s u~ c~ d~ s~ -Defined multiparticle l+ = e+ mu+ -Defined multiparticle l- = e- mu- -Defined multiparticle vl = ve vm vt -Defined multiparticle vl~ = ve~ vm~ vt~ -Defined multiparticle all = g u c d s u~ c~ d~ s~ a ve vm vt e- mu- ve~ vm~ vt~ e+ mu+ t b t~ b~ z w+ h w- ta- ta+ -INFO: Checking for minimal orders which gives processes. -INFO: Please specify coupling orders to bypass this step. -INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED -INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 -INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.005 s -Total: 1 processes with 3 diagrams -output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt -Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt -INFO: Organizing processes into subprocess groups -INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 -INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx -FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. -Generated helas calls for 1 subprocesses (3 diagrams) in 0.004 s -ALOHA: aloha starts to compute helicity amplitudes -ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.104 s - VVV1 - FFV1 - FFV1 - FFV1 -FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. -super_write_set_parameters_onlyfixMajorana (hardcoded=False) -super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc -INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/copilot-igraph/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. -quit - -real 0m0.513s -user 0m0.447s -sys 0m0.065s -Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h index b3c3d0ffb4..a0ff352daf 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h @@ -54,7 +54,7 @@ namespace mg5amcCpu static const std::string getCompiler(); // Other methods of this instance (???) - //const std::vector& getMasses() const { return m_masses; } + const std::vector& getMasses() const { return m_masses; } //virtual int code() const{ return 1; } //void setInitial( int inid1, int inid2 ){ id1 = inid1; id2 = inid2; } //int getDim() const { return dim; } diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc index 63033ea742..8bd1d8c293 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc @@ -328,6 +328,7 @@ main( int argc, char** argv ) // FIXME: the CPPProcess should really be a singleton? (for instance, in bridge mode this will be called twice here?) CPPProcess process( verbose ); process.initProc( "../../Cards/param_card.dat" ); + const fptype* massesFinal = process.getMasses().data() + CPPProcess::npari; const fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) //const fptype energy = 91.2; // Ecms = 91.2 GeV (Z peak) //const fptype energy = 0.100; // Ecms = 100 MeV (well below the Z peak, pure em scattering) @@ -494,12 +495,12 @@ main( int argc, char** argv ) std::unique_ptr prsk; if( rmbsmp == RamboSamplingMode::RamboHost ) { - prsk.reset( new RamboSamplingKernelHost( energy, hstRndmom, hstMomenta, hstWeights, nevt ) ); + prsk.reset( new RamboSamplingKernelHost( energy, hstRndmom, hstMomenta, hstWeights, nevt, massesFinal ) ); } else { #ifdef MGONGPUCPP_GPUIMPL - prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); + prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads, massesFinal ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) #endif diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc index e62c4dd482..5c3691ab61 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc @@ -26,8 +26,9 @@ namespace mg5amcCpu const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta BufferWeights& weights, // output: weights - const size_t nevt ) - : SamplingKernelBase( energy, rndmom, momenta, weights ) + const size_t nevt, + const fptype* massesFinal ) + : SamplingKernelBase( energy, rndmom, momenta, weights, massesFinal ) , NumberOfEvents( nevt ) { if( m_rndmom.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelHost: rndmom must be a host array" ); @@ -85,7 +86,7 @@ namespace mg5amcCpu const fptype* ievtRndmom = MemoryAccessRandomNumbers::ieventAccessRecordConst( m_rndmom.data(), ievt ); fptype* ievtMomenta = MemoryAccessMomenta::ieventAccessRecord( m_momenta.data(), ievt ); fptype* ievtWeights = MemoryAccessWeights::ieventAccessRecord( m_weights.data(), ievt ); - getMomentaFinal( m_energy, ievtRndmom, ievtMomenta, ievtWeights ); + getMomentaFinal( m_energy, ievtRndmom, ievtMomenta, ievtWeights, this->massesFinal() ); } // ** END LOOP ON IEVT ** } @@ -98,11 +99,13 @@ namespace mg5amcCpu BufferMomenta& momenta, // output: momenta BufferWeights& weights, // output: weights const size_t gpublocks, - const size_t gputhreads ) - : SamplingKernelBase( energy, rndmom, momenta, weights ) + const size_t gputhreads, + const fptype* massesFinal ) + : SamplingKernelBase( energy, rndmom, momenta, weights, massesFinal ) , NumberOfEvents( gpublocks * gputhreads ) , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) + , m_massesFinalDevice( nullptr ) { if( !m_rndmom.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelDevice: rndmom must be a device array" ); if( !m_momenta.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelDevice: momenta must be a device array" ); @@ -130,6 +133,18 @@ namespace mg5amcCpu sstr << "RamboSamplingKernelDevice: gputhreads should be a multiple of neppR=" << neppR; throw std::runtime_error( sstr.str() ); } + gpuMalloc( &m_massesFinalDevice, CPPProcess::nparf * sizeof( fptype ) ); + gpuMemcpy( m_massesFinalDevice, this->massesFinal(), CPPProcess::nparf * sizeof( fptype ), gpuMemcpyHostToDevice ); + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + RamboSamplingKernelDevice::~RamboSamplingKernelDevice() + { + if( m_massesFinalDevice != nullptr ) gpuFree( m_massesFinalDevice ); + m_massesFinalDevice = nullptr; } #endif @@ -162,10 +177,11 @@ namespace mg5amcCpu getMomentaFinalDevice( const fptype energy, const fptype* rndmom, fptype* momenta, - fptype* wgts ) + fptype* wgts, + const fptype* massesFinal ) { constexpr auto getMomentaFinal = ramboGetMomentaFinal; - return getMomentaFinal( energy, rndmom, momenta, wgts ); + return getMomentaFinal( energy, rndmom, momenta, wgts, massesFinal ); } #endif @@ -175,7 +191,8 @@ namespace mg5amcCpu void RamboSamplingKernelDevice::getMomentaFinal() { - gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data(), + m_massesFinalDevice ); } #endif diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h index a217619b9c..689a083542 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h @@ -8,6 +8,7 @@ #include "mgOnGpuConfig.h" +#include "CPPProcess.h" #include "MemoryBuffers.h" #ifdef MGONGPUCPP_GPUIMPL @@ -27,12 +28,14 @@ namespace mg5amcCpu SamplingKernelBase( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta - BufferWeights& weights ) // output: weights + BufferWeights& weights, // output: weights + const fptype* massesFinal = nullptr ) // input: final-state masses (size nparf) : m_energy( energy ) , m_rndmom( rndmom ) , m_momenta( momenta ) , m_weights( weights ) { + for( int iparf = 0; iparf < CPPProcess::nparf; iparf++ ) m_massesFinal[iparf] = ( massesFinal == nullptr ) ? 0 : massesFinal[iparf]; } public: @@ -49,6 +52,9 @@ namespace mg5amcCpu // Is this a host or device kernel? virtual bool isOnDevice() const = 0; + // Get final-state masses + const fptype* massesFinal() const { return m_massesFinal; } + protected: // The energy @@ -62,6 +68,9 @@ namespace mg5amcCpu // The buffer for the output weights BufferWeights& m_weights; + + // The final-state masses + fptype m_massesFinal[CPPProcess::nparf]; }; //-------------------------------------------------------------------------- @@ -76,7 +85,8 @@ namespace mg5amcCpu const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta BufferWeights& weights, // output: weights - const size_t nevt ); + const size_t nevt, + const fptype* massesFinal = nullptr ); // Destructor virtual ~RamboSamplingKernelHost() {} @@ -105,10 +115,11 @@ namespace mg5amcCpu BufferMomenta& momenta, // output: momenta BufferWeights& weights, // output: weights const size_t gpublocks, - const size_t gputhreads ); + const size_t gputhreads, + const fptype* massesFinal = nullptr ); // Destructor - virtual ~RamboSamplingKernelDevice() {} + virtual ~RamboSamplingKernelDevice(); // Get momenta of initial state particles void getMomentaInitial() override final; @@ -126,6 +137,9 @@ namespace mg5amcCpu // The number of threads in the GPU grid size_t m_gputhreads; + + // The final-state masses in device memory + fptype* m_massesFinalDevice; }; #endif diff --git a/epochX/cudacpp/gg_tt.sa/src/rambo.h b/epochX/cudacpp/gg_tt.sa/src/rambo.h index b430928a92..df445fbb28 100644 --- a/epochX/cudacpp/gg_tt.sa/src/rambo.h +++ b/epochX/cudacpp/gg_tt.sa/src/rambo.h @@ -60,7 +60,8 @@ namespace mg5amcCpu ramboGetMomentaFinal( const fptype energy, // input: energy const fptype* rndmom, // input: random numbers in [0,1] for one event or for a set of events fptype* momenta, // output: momenta for one event or for a set of events - fptype* wgts ) // output: weights for one event or for a set of events + fptype* wgts, // output: weights for one event or for a set of events + const fptype* massesFinal = nullptr ) // input: final-state masses (size nparf) or null (all massless) { /**************************************************************************** * rambo * @@ -168,7 +169,7 @@ namespace mg5amcCpu #ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large - static int iwarn[5] = { 0, 0, 0, 0, 0 }; + static int iwarn[2] = { 0, 0 }; if( wt < -180. ) { if( iwarn[0] <= 5 ) std::cout << "Too small wt, risk for underflow: " << wt << std::endl; @@ -183,6 +184,87 @@ namespace mg5amcCpu // return for weighted massless momenta // nothing else to do in this event if all particles are massless (nm==0) + int nm = 0; + fptype xmt = 0; + for( int iparf = 0; iparf < nparf; iparf++ ) + { + const fptype mass = ( massesFinal == nullptr ) ? 0 : massesFinal[iparf]; + if( mass != 0 ) nm++; + xmt += fabs( mass ); + } + if( nm == 0 ) return; + + // massive particles: rescale the momenta by a factor x + if( xmt > energy ) return; + const fptype xmax = sqrt( 1. - pow( xmt / energy, 2 ) ); + fptype xm2[nparf]; + fptype p2[nparf]; + fptype e[nparf]; + fptype v[nparf]; + for( int iparf = 0; iparf < nparf; iparf++ ) + { + const fptype mass = ( massesFinal == nullptr ) ? 0 : massesFinal[iparf]; + xm2[iparf] = mass * mass; + const fptype p0 = M_ACCESS::kernelAccessIp4Ipar( momenta, 0, iparf + npari ); + p2[iparf] = p0 * p0; + } + constexpr fptype acc = 1e-14; // Newton-Raphson target accuracy (from legacy MG RAMBO implementation) + constexpr int itmax = 6; // Newton-Raphson max iterations (from legacy MG RAMBO implementation) + int iter = 0; + fptype x = xmax; + const fptype accu = energy * acc; + while( true ) + { + fptype f0 = -energy; + fptype g0 = 0.; + const fptype x2 = x * x; + for( int iparf = 0; iparf < nparf; iparf++ ) + { + e[iparf] = sqrt( xm2[iparf] + x2 * p2[iparf] ); + f0 += e[iparf]; + g0 += p2[iparf] / e[iparf]; + } + if( fabs( f0 ) <= accu ) break; + iter++; + if( iter > itmax ) break; + if( fabs( x * g0 ) <= accu ) break; + x = x - f0 / ( x * g0 ); + } + for( int iparf = 0; iparf < nparf; iparf++ ) + { + v[iparf] = x * M_ACCESS::kernelAccessIp4Ipar( momenta, 0, iparf + npari ); + for( int i4 = 1; i4 < np4; i4++ ) + { + M_ACCESS::kernelAccessIp4Ipar( momenta, i4, iparf + npari ) = x * M_ACCESS::kernelAccessIp4Ipar( momenta, i4, iparf + npari ); + } + M_ACCESS::kernelAccessIp4Ipar( momenta, 0, iparf + npari ) = e[iparf]; + } + + // calculate the mass-effect weight factor + fptype wt2 = 1.; + fptype wt3 = 0.; + for( int iparf = 0; iparf < nparf; iparf++ ) + { + wt2 *= v[iparf] / e[iparf]; + wt3 += v[iparf] * v[iparf] / e[iparf]; + } + const fptype wtm = ( 2. * nparf - 3. ) * log( x ) + log( wt2 / wt3 * energy ); + + // return for weighted massive momenta + wt += wtm; +#ifndef MGONGPUCPP_GPUIMPL + static int iwarnMass[2] = { 0, 0 }; + if( wt < -180. ) + { + if( iwarnMass[0] <= 5 ) std::cout << "Too small wt, risk for underflow: " << wt << std::endl; + iwarnMass[0] = iwarnMass[0] + 1; + } + if( wt > 174. ) + { + if( iwarnMass[1] <= 5 ) std::cout << "Too large wt, risk for overflow: " << wt << std::endl; + iwarnMass[1] = iwarnMass[1] + 1; + } +#endif return; }