diff --git a/docs/sphinx/dev_guide/kernel_class_impl.rst b/docs/sphinx/dev_guide/kernel_class_impl.rst index 4bf46230f..b06f507bc 100644 --- a/docs/sphinx/dev_guide/kernel_class_impl.rst +++ b/docs/sphinx/dev_guide/kernel_class_impl.rst @@ -52,12 +52,24 @@ The methods in the source file are: so these methods are used to describe this. * The number of bytes read and written and the number of floating point operations (FLOPS) performed for each kernel execution. - * The consistency of the checksums of the kernel. The possible values are - ``Consistent`` where all the variant tunings always get the same checksum, - ``ConsistentPerVariantTuning`` where an individual variant tuning always - gets the same checksum but different variant tunings may differ - slightly, and ``Inconsistent`` where the checksum of a variant tuning - may vary slightly run to run. + * The consistency of the checksums of the kernel. If the kernel + always produces the same checksum value for all variant tunings then the + checksums are ``Consistent``. Most kernels get a different but consistent + checksum for each variant tuning so the checksums are + ``ConsistentPerVariantTuning``. On the other hand, some kernels have + variant tunings that get different checksums on each run of that variant + tuning, for example due to the ordering of floating-point atomic add + operations, so the checksums are ``Inconsistent``. + * The tolerance of the checksums of the kernel. A number of predefined + values are available in the ``KernelBase::ChecksumTolerance`` class. If + the kernel consistently produces the same checksums then ``zero`` tolerance + is used. Most kernels use the ``normal`` tolerance. Some kernels are very + simple, for example they have a single floating-point operation per + iteration, so they use the ``tight`` tolerance. + * The scale factor to use with the checksums of the kernel. This is an + arbitrary multiplier on the checksum values used to scale the checksums + to a desired range. 
Mostly used for kernels with floating-point + operation complexity that does not scale linearly with problem size. * The operational complexity of the kernel. * Which RAJA features the kernel exercises. * Adding Suite variants and tunings via ``addVariantTunings``. This calls diff --git a/src/algorithm/ATOMIC.cpp b/src/algorithm/ATOMIC.cpp index 82e825562..f2f6a3086 100644 --- a/src/algorithm/ATOMIC.cpp +++ b/src/algorithm/ATOMIC.cpp @@ -34,6 +34,7 @@ ATOMIC::ATOMIC(const RunParams& params) setFLOPsPerRep(getActualProblemSize()); setChecksumConsistency(ChecksumConsistency::Inconsistent); // atomics + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/algorithm/HISTOGRAM.cpp b/src/algorithm/HISTOGRAM.cpp index 27325a0dc..153aed3f2 100644 --- a/src/algorithm/HISTOGRAM.cpp +++ b/src/algorithm/HISTOGRAM.cpp @@ -41,6 +41,7 @@ HISTOGRAM::HISTOGRAM(const RunParams& params) setFLOPsPerRep( (std::is_floating_point_v ? 1 : 0) * getActualProblemSize() ); setChecksumConsistency(ChecksumConsistency::Consistent); // integer arithmetic + setChecksumTolerance(ChecksumTolerance::zero); setComplexity(Complexity::N); diff --git a/src/algorithm/MEMCPY.cpp b/src/algorithm/MEMCPY.cpp index 16142c9ee..2f5580a2e 100644 --- a/src/algorithm/MEMCPY.cpp +++ b/src/algorithm/MEMCPY.cpp @@ -34,6 +34,7 @@ MEMCPY::MEMCPY(const RunParams& params) setFLOPsPerRep(0); setChecksumConsistency(ChecksumConsistency::Consistent); + setChecksumTolerance(ChecksumTolerance::zero); setComplexity(Complexity::N); diff --git a/src/algorithm/MEMSET.cpp b/src/algorithm/MEMSET.cpp index 9b1466b00..e07c348ce 100644 --- a/src/algorithm/MEMSET.cpp +++ b/src/algorithm/MEMSET.cpp @@ -34,6 +34,7 @@ MEMSET::MEMSET(const RunParams& params) setFLOPsPerRep(0); setChecksumConsistency(ChecksumConsistency::Consistent); + setChecksumTolerance(ChecksumTolerance::zero); setComplexity(Complexity::N); diff --git a/src/algorithm/REDUCE_SUM.cpp b/src/algorithm/REDUCE_SUM.cpp index 
68efa8e79..22c5be606 100644 --- a/src/algorithm/REDUCE_SUM.cpp +++ b/src/algorithm/REDUCE_SUM.cpp @@ -34,6 +34,7 @@ REDUCE_SUM::REDUCE_SUM(const RunParams& params) setFLOPsPerRep(getActualProblemSize()); setChecksumConsistency(ChecksumConsistency::Inconsistent); // Reduction may use atomics + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp index 587c03b82..7d3d784f4 100644 --- a/src/algorithm/SCAN.cpp +++ b/src/algorithm/SCAN.cpp @@ -33,12 +33,12 @@ SCAN::SCAN(const RunParams& params) setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); - checksum_scale_factor = 1e-2 * + setChecksumConsistency(ChecksumConsistency::Inconsistent); // could depend on scheduling, this may be overly conservative + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(1e-2 * ( static_cast(getDefaultProblemSize()) / getActualProblemSize() ) / - getActualProblemSize(); - - setChecksumConsistency(ChecksumConsistency::Inconsistent); // could depend on scheduling, this may be overly conservative + getActualProblemSize()); setComplexity(Complexity::N); @@ -59,7 +59,7 @@ void SCAN::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void SCAN::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_y, getActualProblemSize(), checksum_scale_factor, vid); + checksum[vid][tune_idx] += calcChecksum(m_y, getActualProblemSize(), vid); } void SCAN::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/algorithm/SORT.cpp b/src/algorithm/SORT.cpp index 6e011ad27..31a09712d 100644 --- a/src/algorithm/SORT.cpp +++ b/src/algorithm/SORT.cpp @@ -33,7 +33,8 @@ SORT::SORT(const RunParams& params) setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); - setChecksumConsistency(ChecksumConsistency::Consistent); // // sort is not stable but values are equal if equivalent + 
setChecksumConsistency(ChecksumConsistency::Consistent); // sort is not stable but values are equal if equivalent + setChecksumTolerance(ChecksumTolerance::zero); setComplexity(Complexity::N_logN); diff --git a/src/algorithm/SORTPAIRS.cpp b/src/algorithm/SORTPAIRS.cpp index ca5407481..37f18cc26 100644 --- a/src/algorithm/SORTPAIRS.cpp +++ b/src/algorithm/SORTPAIRS.cpp @@ -34,6 +34,7 @@ SORTPAIRS::SORTPAIRS(const RunParams& params) setFLOPsPerRep(0); setChecksumConsistency(ChecksumConsistency::Inconsistent); // sort is not stable and could depend on scheduling + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N_logN); diff --git a/src/apps/CONVECTION3DPA.cpp b/src/apps/CONVECTION3DPA.cpp index 8ad78034d..88b57ff6f 100644 --- a/src/apps/CONVECTION3DPA.cpp +++ b/src/apps/CONVECTION3DPA.cpp @@ -52,6 +52,7 @@ CONVECTION3DPA::CONVECTION3DPA(const RunParams& params) )); setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index 4dee604a3..8c0019148 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -44,6 +44,7 @@ DEL_DOT_VEC_2D::DEL_DOT_VEC_2D(const RunParams& params) setFLOPsPerRep(54 * m_domain->n_real_zones); setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp index ce71abe98..b2718d2dc 100644 --- a/src/apps/DIFFUSION3DPA.cpp +++ b/src/apps/DIFFUSION3DPA.cpp @@ -53,6 +53,7 @@ DIFFUSION3DPA::DIFFUSION3DPA(const RunParams& params) 3 * DPA_D1D * DPA_D1D * DPA_D1D)); setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/apps/EDGE3D.cpp b/src/apps/EDGE3D.cpp index 
939d9c0f9..612eacdd9 100644 --- a/src/apps/EDGE3D.cpp +++ b/src/apps/EDGE3D.cpp @@ -58,11 +58,11 @@ EDGE3D::EDGE3D(const RunParams& params) setFLOPsPerRep(number_of_elements * flops_per_element); - m_checksum_scale_factor = 0.001 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(0.001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N); @@ -92,7 +92,7 @@ void EDGE3D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void EDGE3D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_sum, m_array_length, m_checksum_scale_factor, vid ); + checksum[vid][tune_idx] += calcChecksum(m_sum, m_array_length, vid ); } void EDGE3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/EDGE3D.hpp b/src/apps/EDGE3D.hpp index 10dc47bd3..51d2b269a 100644 --- a/src/apps/EDGE3D.hpp +++ b/src/apps/EDGE3D.hpp @@ -441,8 +441,6 @@ class EDGE3D : public KernelBase ADomain* m_domain; Index_type m_array_length; - - Real_type m_checksum_scale_factor; }; } // end namespace apps diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp index bc2ac2b17..270cb8784 100644 --- a/src/apps/ENERGY.cpp +++ b/src/apps/ENERGY.cpp @@ -54,6 +54,7 @@ ENERGY::ENERGY(const RunParams& params) ) * getActualProblemSize()); setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/apps/FEMSWEEP.cpp b/src/apps/FEMSWEEP.cpp index e24344739..032f1abec 100644 --- a/src/apps/FEMSWEEP.cpp +++ b/src/apps/FEMSWEEP.cpp @@ -65,9 +65,9 @@ FEMSWEEP::FEMSWEEP(const RunParams& params) m_ne * m_na * m_ng ); // for all elements, angles, and groups // The checksum is inaccurate starting at the 10's digit for: AMD CPU and 
older clang versions on NVIDIA GPUs. - checksum_scale_factor = 0.0000000001; - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(0.0000000001); setComplexity(Complexity::N); @@ -106,7 +106,7 @@ void FEMSWEEP::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void FEMSWEEP::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_Xdat, m_Xlen, checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_Xdat, m_Xlen, vid); } void FEMSWEEP::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp index a359327c0..95cc0777b 100644 --- a/src/apps/FIR.cpp +++ b/src/apps/FIR.cpp @@ -36,11 +36,11 @@ FIR::FIR(const RunParams& params) setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep((2 * m_coefflen) * getActualProblemSize()); - checksum_scale_factor = 0.0001 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(0.0001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N); @@ -61,7 +61,7 @@ void FIR::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void FIR::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_out, getActualProblemSize(), checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_out, getActualProblemSize(), vid); } void FIR::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/INTSC_HEXHEX.cpp b/src/apps/INTSC_HEXHEX.cpp index 7c4d01852..31ee77230 100644 --- a/src/apps/INTSC_HEXHEX.cpp +++ b/src/apps/INTSC_HEXHEX.cpp @@ -73,6 +73,7 @@ INTSC_HEXHEX::INTSC_HEXHEX(const RunParams& params) setFLOPsPerRep(n_std_intsc * flops_per_intsc); 
setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/apps/INTSC_HEXRECT.cpp b/src/apps/INTSC_HEXRECT.cpp index 35a52cc87..fb5cd0853 100644 --- a/src/apps/INTSC_HEXRECT.cpp +++ b/src/apps/INTSC_HEXRECT.cpp @@ -81,6 +81,7 @@ INTSC_HEXRECT::INTSC_HEXRECT(const RunParams& params) setFLOPsPerRep(n_intsc * flops_per_intsc); setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/apps/LTIMES.cpp b/src/apps/LTIMES.cpp index 9d8e193f9..c5640e8f8 100644 --- a/src/apps/LTIMES.cpp +++ b/src/apps/LTIMES.cpp @@ -49,11 +49,11 @@ LTIMES::LTIMES(const RunParams& params) setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(2 * m_num_z * m_num_g * m_num_m * m_num_d); - checksum_scale_factor = 0.001 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); // Change to Inconsistent if internal reductions use atomics + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(0.001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N); @@ -77,7 +77,7 @@ void LTIMES::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void LTIMES::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_phidat, m_philen, checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_phidat, m_philen, vid); } void LTIMES::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/LTIMES_NOVIEW.cpp b/src/apps/LTIMES_NOVIEW.cpp index 59f350dad..ba842cfe2 100644 --- a/src/apps/LTIMES_NOVIEW.cpp +++ b/src/apps/LTIMES_NOVIEW.cpp @@ -49,11 +49,11 @@ LTIMES_NOVIEW::LTIMES_NOVIEW(const RunParams& params) setBytesAtomicModifyWrittenPerRep( 0 ); 
setFLOPsPerRep(2 * m_num_z * m_num_g * m_num_m * m_num_d); - checksum_scale_factor = 0.001 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); // Change to Inconsistent if internal reductions use atomics + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(0.001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N); @@ -76,7 +76,7 @@ void LTIMES_NOVIEW::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void LTIMES_NOVIEW::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_phidat, m_philen, checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_phidat, m_philen, vid); } void LTIMES_NOVIEW::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/MASS3DEA.cpp b/src/apps/MASS3DEA.cpp index 0155be304..7ef7cce6e 100644 --- a/src/apps/MASS3DEA.cpp +++ b/src/apps/MASS3DEA.cpp @@ -45,6 +45,7 @@ MASS3DEA::MASS3DEA(const RunParams& params) setFLOPsPerRep(m_NE * 7 * ea_mat_entries); setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp index 55eea8090..d348699c5 100644 --- a/src/apps/MASS3DPA.cpp +++ b/src/apps/MASS3DPA.cpp @@ -49,6 +49,7 @@ MASS3DPA::MASS3DPA(const RunParams& params) 2 * MPA_Q1D * MPA_D1D * MPA_D1D * MPA_D1D + MPA_D1D * MPA_D1D * MPA_D1D)); setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/apps/MASSVEC3DPA.cpp b/src/apps/MASSVEC3DPA.cpp index 9759d5a96..fad4d4405 100644 --- a/src/apps/MASSVEC3DPA.cpp +++ b/src/apps/MASSVEC3DPA.cpp @@ -56,6 +56,7 @@ MASSVEC3DPA::MASSVEC3DPA(const RunParams &params) 2 * MVPA_Q1D * MVPA_D1D * 
MVPA_D1D * MVPA_D1D)); setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/apps/MATVEC_3D_STENCIL.cpp b/src/apps/MATVEC_3D_STENCIL.cpp index 8f98d6c4e..db6ca83a4 100644 --- a/src/apps/MATVEC_3D_STENCIL.cpp +++ b/src/apps/MATVEC_3D_STENCIL.cpp @@ -79,11 +79,11 @@ MATVEC_3D_STENCIL::MATVEC_3D_STENCIL(const RunParams& params) const size_t adds = 26; setFLOPsPerRep((multiplies + adds) * getItsPerRep()); - checksum_scale_factor = 1.0 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(1.0 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N); @@ -138,7 +138,7 @@ void MATVEC_3D_STENCIL::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx void MATVEC_3D_STENCIL::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid].at(tune_idx) += calcChecksum(m_b, m_zonal_array_length, checksum_scale_factor , vid); + checksum[vid].at(tune_idx) += calcChecksum(m_b, m_zonal_array_length, vid); } void MATVEC_3D_STENCIL::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/NODAL_ACCUMULATION_3D.cpp b/src/apps/NODAL_ACCUMULATION_3D.cpp index 1f09d4c22..2bcc0a8ba 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D.cpp @@ -45,11 +45,11 @@ NODAL_ACCUMULATION_3D::NODAL_ACCUMULATION_3D(const RunParams& params) setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) * m_domain->n_real_nodes ); setFLOPsPerRep(9 * getItsPerRep()); - checksum_scale_factor = 0.001 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::Inconsistent); + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(0.001 * + ( 
static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N); @@ -77,7 +77,7 @@ void NODAL_ACCUMULATION_3D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune void NODAL_ACCUMULATION_3D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid].at(tune_idx) += calcChecksum(m_x, m_nodal_array_length, checksum_scale_factor , vid); + checksum[vid].at(tune_idx) += calcChecksum(m_x, m_nodal_array_length, vid); } void NODAL_ACCUMULATION_3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/PRESSURE.cpp b/src/apps/PRESSURE.cpp index 6cded3f57..f01c83830 100644 --- a/src/apps/PRESSURE.cpp +++ b/src/apps/PRESSURE.cpp @@ -38,6 +38,7 @@ PRESSURE::PRESSURE(const RunParams& params) ) * getActualProblemSize()); setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index d541622bb..46c6b48af 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -43,11 +43,11 @@ VOL3D::VOL3D(const RunParams& params) setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(72 * (m_domain->lpz+1 - m_domain->fpz)); - checksum_scale_factor = 0.001 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(0.001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N); @@ -79,7 +79,7 @@ void VOL3D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void VOL3D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_vol, m_array_length, checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_vol, m_array_length, vid); } void VOL3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git 
a/src/apps/ZONAL_ACCUMULATION_3D.cpp b/src/apps/ZONAL_ACCUMULATION_3D.cpp index 2f6f75975..f29d250fc 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D.cpp +++ b/src/apps/ZONAL_ACCUMULATION_3D.cpp @@ -45,11 +45,11 @@ ZONAL_ACCUMULATION_3D::ZONAL_ACCUMULATION_3D(const RunParams& params) setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(8 * getItsPerRep()); - checksum_scale_factor = 0.001 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(0.001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N); @@ -76,7 +76,7 @@ void ZONAL_ACCUMULATION_3D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune void ZONAL_ACCUMULATION_3D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid].at(tune_idx) += calcChecksum(m_vol, m_zonal_array_length, checksum_scale_factor , vid); + checksum[vid].at(tune_idx) += calcChecksum(m_vol, m_zonal_array_length, vid); } void ZONAL_ACCUMULATION_3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/basic/ARRAY_OF_PTRS.cpp b/src/basic/ARRAY_OF_PTRS.cpp index 0db7901ac..05d16acde 100644 --- a/src/basic/ARRAY_OF_PTRS.cpp +++ b/src/basic/ARRAY_OF_PTRS.cpp @@ -36,6 +36,7 @@ ARRAY_OF_PTRS::ARRAY_OF_PTRS(const RunParams& params) setFLOPsPerRep(m_array_size * getActualProblemSize()); setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/basic/COPY8.cpp b/src/basic/COPY8.cpp index 5d27573d2..4deb900b2 100644 --- a/src/basic/COPY8.cpp +++ b/src/basic/COPY8.cpp @@ -34,6 +34,7 @@ COPY8::COPY8(const RunParams& params) setFLOPsPerRep(0); setChecksumConsistency(ChecksumConsistency::Consistent); + setChecksumTolerance(ChecksumTolerance::zero); setComplexity(Complexity::N); diff --git 
a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp index 0a596a005..ae64804f7 100644 --- a/src/basic/DAXPY.cpp +++ b/src/basic/DAXPY.cpp @@ -34,6 +34,7 @@ DAXPY::DAXPY(const RunParams& params) setFLOPsPerRep(2 * getActualProblemSize()); setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/basic/DAXPY_ATOMIC.cpp b/src/basic/DAXPY_ATOMIC.cpp index 97129f832..005254c19 100644 --- a/src/basic/DAXPY_ATOMIC.cpp +++ b/src/basic/DAXPY_ATOMIC.cpp @@ -34,6 +34,7 @@ DAXPY_ATOMIC::DAXPY_ATOMIC(const RunParams& params) setFLOPsPerRep(2 * getActualProblemSize()); setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/basic/EMPTY.cpp b/src/basic/EMPTY.cpp index 8787d821c..d58ca4aae 100644 --- a/src/basic/EMPTY.cpp +++ b/src/basic/EMPTY.cpp @@ -34,6 +34,7 @@ EMPTY::EMPTY(const RunParams& params) setFLOPsPerRep( 0 ); setChecksumConsistency(ChecksumConsistency::Consistent); + setChecksumTolerance(ChecksumTolerance::zero); setComplexity(Complexity::N); diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp index 21c9e2710..4749c7378 100644 --- a/src/basic/IF_QUAD.cpp +++ b/src/basic/IF_QUAD.cpp @@ -35,11 +35,11 @@ IF_QUAD::IF_QUAD(const RunParams& params) setFLOPsPerRep(4 * getActualProblemSize() + 7 * getActualProblemSize() / 2); - checksum_scale_factor = 0.0001 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(0.0001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N); @@ -63,8 +63,8 @@ void IF_QUAD::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void IF_QUAD::updateChecksum(VariantID vid, size_t tune_idx) { - 
checksum[vid][tune_idx] += calcChecksum(m_x1, getActualProblemSize(), checksum_scale_factor , vid); - checksum[vid][tune_idx] += calcChecksum(m_x2, getActualProblemSize(), checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_x1, getActualProblemSize(), vid); + checksum[vid][tune_idx] += calcChecksum(m_x2, getActualProblemSize(), vid); } void IF_QUAD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index ed923d13f..982b9489b 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -36,6 +36,7 @@ INDEXLIST::INDEXLIST(const RunParams& params) setFLOPsPerRep(0); setChecksumConsistency(ChecksumConsistency::Consistent); + setChecksumTolerance(ChecksumTolerance::zero); setComplexity(Complexity::N); diff --git a/src/basic/INDEXLIST_3LOOP.cpp b/src/basic/INDEXLIST_3LOOP.cpp index 3d9a5a6f4..e93069f19 100644 --- a/src/basic/INDEXLIST_3LOOP.cpp +++ b/src/basic/INDEXLIST_3LOOP.cpp @@ -44,6 +44,7 @@ INDEXLIST_3LOOP::INDEXLIST_3LOOP(const RunParams& params) setFLOPsPerRep(0); setChecksumConsistency(ChecksumConsistency::Consistent); + setChecksumTolerance(ChecksumTolerance::zero); setComplexity(Complexity::N); diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp index da3a02663..ae348b335 100644 --- a/src/basic/INIT3.cpp +++ b/src/basic/INIT3.cpp @@ -34,6 +34,7 @@ INIT3::INIT3(const RunParams& params) setFLOPsPerRep(1 * getActualProblemSize()); setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp index 9fef02ced..c823de7dc 100644 --- a/src/basic/INIT_VIEW1D.cpp +++ b/src/basic/INIT_VIEW1D.cpp @@ -33,7 +33,8 @@ INIT_VIEW1D::INIT_VIEW1D(const RunParams& params) setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); - setChecksumConsistency(ChecksumConsistency::Consistent); + 
setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::tight); setComplexity(Complexity::N); diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp index 1e594cbc6..832b07901 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET.cpp @@ -33,7 +33,8 @@ INIT_VIEW1D_OFFSET::INIT_VIEW1D_OFFSET(const RunParams& params) setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); - setChecksumConsistency(ChecksumConsistency::Consistent); + setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::tight); setComplexity(Complexity::N); diff --git a/src/basic/MAT_MAT_SHARED.cpp b/src/basic/MAT_MAT_SHARED.cpp index 4844504c7..9f34ff94c 100644 --- a/src/basic/MAT_MAT_SHARED.cpp +++ b/src/basic/MAT_MAT_SHARED.cpp @@ -38,11 +38,11 @@ MAT_MAT_SHARED::MAT_MAT_SHARED(const RunParams &params) setFLOPsPerRep(2 * TL_SZ * TL_SZ * TL_SZ * num_tiles * num_tiles * num_tiles); - checksum_scale_factor = 1e-6 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); // Change to Inconsistent if internal reductions use atomics + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(1e-6 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N_to_the_three_halves); @@ -62,7 +62,7 @@ void MAT_MAT_SHARED::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) } void MAT_MAT_SHARED::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_C, m_N*m_N, checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_C, m_N*m_N, vid); } void MAT_MAT_SHARED::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp index c2afe48ca..47ce20552 
100644 --- a/src/basic/MULADDSUB.cpp +++ b/src/basic/MULADDSUB.cpp @@ -33,7 +33,8 @@ MULADDSUB::MULADDSUB(const RunParams& params) setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(3 * getActualProblemSize()); - setChecksumConsistency(ChecksumConsistency::Consistent); + setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::tight); setComplexity(Complexity::N); diff --git a/src/basic/MULTI_REDUCE.cpp b/src/basic/MULTI_REDUCE.cpp index 9e7daf4c9..e0cf00156 100644 --- a/src/basic/MULTI_REDUCE.cpp +++ b/src/basic/MULTI_REDUCE.cpp @@ -42,6 +42,7 @@ MULTI_REDUCE::MULTI_REDUCE(const RunParams& params) setFLOPsPerRep(1 * getActualProblemSize()); setChecksumConsistency(ChecksumConsistency::Inconsistent); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp index 29f6db466..7932e2c5b 100644 --- a/src/basic/NESTED_INIT.cpp +++ b/src/basic/NESTED_INIT.cpp @@ -44,7 +44,8 @@ NESTED_INIT::NESTED_INIT(const RunParams& params) setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(3 * getActualProblemSize()); - setChecksumConsistency(ChecksumConsistency::Consistent); + setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::tight); setComplexity(Complexity::N); diff --git a/src/basic/PI_ATOMIC.cpp b/src/basic/PI_ATOMIC.cpp index b1be49296..e3ce8afe7 100644 --- a/src/basic/PI_ATOMIC.cpp +++ b/src/basic/PI_ATOMIC.cpp @@ -34,6 +34,7 @@ PI_ATOMIC::PI_ATOMIC(const RunParams& params) setFLOPsPerRep(6 * getActualProblemSize() + 1); setChecksumConsistency(ChecksumConsistency::Inconsistent); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/basic/PI_REDUCE.cpp b/src/basic/PI_REDUCE.cpp index b6507ee6a..ae970205d 100644 --- a/src/basic/PI_REDUCE.cpp +++ b/src/basic/PI_REDUCE.cpp @@ -34,6 +34,7 @@ PI_REDUCE::PI_REDUCE(const 
RunParams& params) setFLOPsPerRep(6 * getActualProblemSize() + 1); setChecksumConsistency(ChecksumConsistency::Inconsistent); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp index e3bfda3d2..72ff383ac 100644 --- a/src/basic/REDUCE3_INT.cpp +++ b/src/basic/REDUCE3_INT.cpp @@ -40,6 +40,7 @@ REDUCE3_INT::REDUCE3_INT(const RunParams& params) setFLOPsPerRep(0); setChecksumConsistency(ChecksumConsistency::Inconsistent); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp index 2217a0b91..a00870eeb 100644 --- a/src/basic/REDUCE_STRUCT.cpp +++ b/src/basic/REDUCE_STRUCT.cpp @@ -40,6 +40,7 @@ REDUCE_STRUCT::REDUCE_STRUCT(const RunParams& params) setFLOPsPerRep(2 * getActualProblemSize() + 2); setChecksumConsistency(ChecksumConsistency::Inconsistent); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/basic/TRAP_INT.cpp b/src/basic/TRAP_INT.cpp index d921acb7e..3bc269f4a 100644 --- a/src/basic/TRAP_INT.cpp +++ b/src/basic/TRAP_INT.cpp @@ -34,6 +34,7 @@ TRAP_INT::TRAP_INT(const RunParams& params) setFLOPsPerRep(10 * getActualProblemSize()); // 1 sqrt setChecksumConsistency(ChecksumConsistency::Inconsistent); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/comm/HALO_EXCHANGE.cpp b/src/comm/HALO_EXCHANGE.cpp index b9e410bc3..860ac8687 100644 --- a/src/comm/HALO_EXCHANGE.cpp +++ b/src/comm/HALO_EXCHANGE.cpp @@ -48,6 +48,7 @@ HALO_EXCHANGE::HALO_EXCHANGE(const RunParams& params) setFLOPsPerRep(0); setChecksumConsistency(ChecksumConsistency::Consistent); + setChecksumTolerance(ChecksumTolerance::zero); setComplexity(Complexity::N_to_the_two_thirds); diff --git a/src/comm/HALO_EXCHANGE_FUSED.cpp b/src/comm/HALO_EXCHANGE_FUSED.cpp index 907fcfd5c..3a06e0312 100644 --- 
a/src/comm/HALO_EXCHANGE_FUSED.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED.cpp @@ -48,6 +48,7 @@ HALO_EXCHANGE_FUSED::HALO_EXCHANGE_FUSED(const RunParams& params) setFLOPsPerRep(0); setChecksumConsistency(ChecksumConsistency::Consistent); + setChecksumTolerance(ChecksumTolerance::zero); setComplexity(Complexity::N_to_the_two_thirds); diff --git a/src/comm/HALO_PACKING.cpp b/src/comm/HALO_PACKING.cpp index ca7d7ee9f..bf090a01c 100644 --- a/src/comm/HALO_PACKING.cpp +++ b/src/comm/HALO_PACKING.cpp @@ -38,6 +38,7 @@ HALO_PACKING::HALO_PACKING(const RunParams& params) setFLOPsPerRep(0); setChecksumConsistency(ChecksumConsistency::Consistent); + setChecksumTolerance(ChecksumTolerance::zero); setComplexity(Complexity::N_to_the_two_thirds); diff --git a/src/comm/HALO_PACKING_FUSED.cpp b/src/comm/HALO_PACKING_FUSED.cpp index 978031419..55002d46c 100644 --- a/src/comm/HALO_PACKING_FUSED.cpp +++ b/src/comm/HALO_PACKING_FUSED.cpp @@ -38,6 +38,7 @@ HALO_PACKING_FUSED::HALO_PACKING_FUSED(const RunParams& params) setFLOPsPerRep(0); setChecksumConsistency(ChecksumConsistency::Consistent); + setChecksumTolerance(ChecksumTolerance::zero); setComplexity(Complexity::N_to_the_two_thirds); diff --git a/src/comm/HALO_SENDRECV.cpp b/src/comm/HALO_SENDRECV.cpp index 9be78ecf3..a80c873a5 100644 --- a/src/comm/HALO_SENDRECV.cpp +++ b/src/comm/HALO_SENDRECV.cpp @@ -38,6 +38,7 @@ HALO_SENDRECV::HALO_SENDRECV(const RunParams& params) setFLOPsPerRep(0); setChecksumConsistency(ChecksumConsistency::Consistent); + setChecksumTolerance(ChecksumTolerance::zero); setComplexity(Complexity::N_to_the_two_thirds); diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index c0674a37e..b0b64bb36 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -575,16 +575,15 @@ void initData(Real_type& d) * Calculate and return checksum for data arrays. 
*/ template < typename Data_getter > -long double calcChecksumImpl(Data_getter data, Size_type len, - Real_type scale_factor) +Checksum_type calcChecksumImpl(Data_getter data, Size_type len) { - long double tchk = 0.0; - long double ckahan = 0.0; + Checksum_type tchk = 0.0; + Checksum_type ckahan = 0.0; for (Size_type j = 0; j < len; ++j) { - long double x = (std::abs(std::sin(j+1.0))+0.5) * data(j); - long double y = x - ckahan; - volatile long double t = tchk + y; - volatile long double z = t - tchk; + Checksum_type x = (std::abs(std::sin(j+1.0))+0.5) * data(j); + Checksum_type y = x - ckahan; + volatile Checksum_type t = tchk + y; + volatile Checksum_type z = t - tchk; ckahan = z - y; tchk = t; #if 0 // RDH DEBUG @@ -593,40 +592,35 @@ long double calcChecksumImpl(Data_getter data, Size_type len, } #endif } - tchk *= scale_factor; return tchk; } -long double calcChecksum(Int_ptr ptr, Size_type len, - Real_type scale_factor) +Checksum_type calcChecksum(Int_ptr ptr, Size_type len) { return calcChecksumImpl([=](Size_type j) { - return static_cast(ptr[j]); - }, len, scale_factor); + return static_cast(ptr[j]); + }, len); } -long double calcChecksum(unsigned long long* ptr, Size_type len, - Real_type scale_factor) +Checksum_type calcChecksum(unsigned long long* ptr, Size_type len) { return calcChecksumImpl([=](Size_type j) { - return static_cast(ptr[j]); - }, len, scale_factor); + return static_cast(ptr[j]); + }, len); } -long double calcChecksum(Real_ptr ptr, Size_type len, - Real_type scale_factor) +Checksum_type calcChecksum(Real_ptr ptr, Size_type len) { return calcChecksumImpl([=](Size_type j) { - return static_cast(ptr[j]); - }, len, scale_factor); + return static_cast(ptr[j]); + }, len); } -long double calcChecksum(Complex_ptr ptr, Size_type len, - Real_type scale_factor) +Checksum_type calcChecksum(Complex_ptr ptr, Size_type len) { return calcChecksumImpl([=](Size_type j) { - return static_cast(real(ptr[j])+imag(ptr[j])); - }, len, scale_factor); + return 
static_cast(real(ptr[j])+imag(ptr[j])); + }, len); } } // closing brace for detail namespace diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index bfa34efa9..fe805c262 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -151,17 +151,13 @@ void initData(Real_type& d); * * Checksumn is multiplied by given scale factor. */ -long double calcChecksum(Int_ptr d, Size_type len, - Real_type scale_factor); +Checksum_type calcChecksum(Int_ptr d, Size_type len); /// -long double calcChecksum(unsigned long long* d, Size_type len, - Real_type scale_factor); +Checksum_type calcChecksum(unsigned long long* d, Size_type len); /// -long double calcChecksum(Real_ptr d, Size_type len, - Real_type scale_factor); +Checksum_type calcChecksum(Real_ptr d, Size_type len); /// -long double calcChecksum(Complex_ptr d, Size_type len, - Real_type scale_factor); +Checksum_type calcChecksum(Complex_ptr d, Size_type len); } // closing brace for detail namespace @@ -385,8 +381,7 @@ inline void allocAndInitDataRandValue(DataSpace dataSpace, T*& ptr, Size_type le * Calculate and return checksum for arrays. 
*/ template -inline long double calcChecksum(DataSpace dataSpace, T* ptr, Size_type len, Size_type align, - Real_type scale_factor) +inline Checksum_type calcChecksum(DataSpace dataSpace, T* ptr, Size_type len, Size_type align) { T* check_ptr = ptr; T* copied_ptr = nullptr; @@ -400,7 +395,7 @@ inline long double calcChecksum(DataSpace dataSpace, T* ptr, Size_type len, Size check_ptr = copied_ptr; } - auto val = detail::calcChecksum(check_ptr, len, scale_factor); + Checksum_type val = detail::calcChecksum(check_ptr, len); if (check_dataSpace != dataSpace) { deallocData(check_dataSpace, copied_ptr); diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index e90ebd149..72beeb4c3 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -723,23 +723,27 @@ void Executor::runKernel(KernelBase* kernel, bool print_kernel_name) kernel->execute(vid, tune_idx); // Execute kernel if ( run_params.showProgress() ) { - getCout() << " -- " << kernel->getLastTime() / kernel->getRunReps() << " sec. 
x " << kernel->getRunReps() << " rep."; - - size_t prec = 20; - const auto default_precision = getCout().precision(); - Checksum_type checksum = kernel->getChecksum(vid, tune_idx); + Checksum_type cksum_tol = kernel->getChecksumTolerance(); + Checksum_type cksum_ref = kernel->getReferenceChecksum(); + Checksum_type cksum = kernel->getChecksum(vid, tune_idx); + Checksum_type cksum_diff = std::abs(cksum_ref - cksum); #if defined(RAJA_PERFSUITE_ENABLE_MPI) { - Checksum_type checksum_sum = 0; - Allreduce(&checksum, &checksum_sum, 1, MPI_SUM, MPI_COMM_WORLD); - checksum = checksum_sum / num_ranks; + Checksum_type cksum_diff_max = 1e80; + Allreduce(&cksum_diff, &cksum_diff_max, 1, MPI_MAX, MPI_COMM_WORLD); + cksum_diff = cksum_diff_max; } - getCout() << " checksum_avg "; -#else - getCout() << " checksum "; #endif - getCout() << setprecision(prec) << checksum - << setprecision(default_precision) << endl; + const char* cksum_result = "FAILED"; + if (cksum_diff <= cksum_tol) { + cksum_result = "PASSED"; + } + + getCout() << " -- " + << kernel->getLastTime() / kernel->getRunReps() << " sec." + << " x " << kernel->getRunReps() << " rep." + << " " << cksum_result << " checksum" + << endl; } } else { @@ -1364,6 +1368,7 @@ void Executor::writeChecksumReport(ostream& file) } namecol_width++; + size_t resultcol_width = 6+2; // // Print title. @@ -1377,11 +1382,15 @@ void Executor::writeChecksumReport(ostream& file) file << equal_line << endl; // - // Print column title line. + // Print column title lines. 
// file <getName() << endl; + file << dot_line << endl; - Checksum_type cksum_ref = 0.0; - size_t ivck = 0; - bool found_ref = false; - while ( ivck < variant_ids.size() && !found_ref ) { - VariantID vid = variant_ids[ivck]; - size_t num_tunings = kern->getNumVariantTunings(vid); - for (size_t tune_idx = 0; tune_idx < num_tunings; ++tune_idx) { - if ( kern->wasVariantTuningRun(vid, tune_idx) ) { - cksum_ref = kern->getChecksum(vid, tune_idx); - found_ref = true; - break; - } - } - ++ivck; - } + Checksum_type cksum_tol = kern->getChecksumTolerance(); + Checksum_type cksum_ref = kern->getReferenceChecksum(); // get vector of checksums and diffs std::vector> checksums(variant_ids.size()); - std::vector> checksums_diff(variant_ids.size()); + std::vector> checksums_abs_diff(variant_ids.size()); for (size_t iv = 0; iv < variant_ids.size(); ++iv) { VariantID vid = variant_ids[iv]; size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); checksums[iv].resize(num_tunings, 0.0); - checksums_diff[iv].resize(num_tunings, 0.0); + checksums_abs_diff[iv].resize(num_tunings, 0.0); for (size_t tune_idx = 0; tune_idx < num_tunings; ++tune_idx) { if ( kern->wasVariantTuningRun(vid, tune_idx) ) { checksums[iv][tune_idx] = kern->getChecksum(vid, tune_idx); - checksums_diff[iv][tune_idx] = cksum_ref - kern->getChecksum(vid, tune_idx); + checksums_abs_diff[iv][tune_idx] = std::abs(cksum_ref - kern->getChecksum(vid, tune_idx)); } } } @@ -1462,16 +1463,6 @@ void Executor::writeChecksumReport(ostream& file) } } - // get stats for checksums_abs_diff - std::vector> checksums_abs_diff(variant_ids.size()); - for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); - checksums_abs_diff[iv].resize(num_tunings, 0.0); - for (size_t tune_idx = 0; tune_idx < num_tunings; ++tune_idx) { - checksums_abs_diff[iv][tune_idx] = std::abs(checksums_diff[iv][tune_idx]); - } - } - std::vector> 
checksums_abs_diff_min(variant_ids.size()); std::vector> checksums_abs_diff_max(variant_ids.size()); std::vector> checksums_abs_diff_sum(variant_ids.size()); @@ -1530,18 +1521,32 @@ void Executor::writeChecksumReport(ostream& file) const string& tuning_name = kern->getVariantTuningName(vid, tune_idx); if ( kern->wasVariantTuningRun(vid, tune_idx) ) { + const char* result = "FAILED"; + if ( +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + checksums_abs_diff_max[iv][tune_idx] +#else + checksums_abs_diff[iv][tune_idx] +#endif + <= cksum_tol ) { + result = "PASSED"; + } file <runKernel(vid, tune_idx); + Checksum_type old_checksum = checksum[vid][tune_idx]; + checksum[vid][tune_idx] = 0; this->updateChecksum(vid, tune_idx); + checksum[vid][tune_idx] *= checksum_scale_factor; + checksum[vid][tune_idx] += old_checksum; + + if (checksum_reference_variant == NumVariants) { + // use first run variant tuning as checksum reference + checksum_reference_variant = vid; + checksum_reference_tuning = tune_idx; + } this->tearDown(vid, tune_idx); diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 1fa6f695b..c7d11f6a7 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -122,6 +122,8 @@ class KernelBase void setFLOPsPerRep(Index_type FLOPs) { FLOPs_per_rep = FLOPs; } void setBlockSize(Index_type size) { kernel_block_size = size; } void setChecksumConsistency(ChecksumConsistency cc) { checksum_consistency = cc; } + void setChecksumTolerance(Checksum_type ct) { checksum_tolerance = ct; } + void setChecksumScaleFactor(Checksum_type csf) { checksum_scale_factor = csf; } void setComplexity(Complexity ac) { complexity = ac; } void setUsesFeature(FeatureID fid) { uses_feature[fid] = true; } @@ -207,6 +209,7 @@ class KernelBase Index_type getFLOPsPerRep() const { return FLOPs_per_rep; } double getBlockSize() const { return kernel_block_size; } ChecksumConsistency getChecksumConsistency() const { return checksum_consistency; }; + Checksum_type 
getChecksumTolerance() const { return checksum_tolerance; } Complexity getComplexity() const { return complexity; }; Index_type getTargetProblemSize() const; @@ -276,6 +279,14 @@ class KernelBase double getTotTime(VariantID vid, size_t tune_idx) const { return tot_time[vid].at(tune_idx); } + // Get reference checksum (first variant tuning run) + Checksum_type getReferenceChecksum() const + { + if (checksum_reference_variant == NumVariants) { + throw std::runtime_error("Can't get reference checksum if kernel was not run"); + } + return checksum[checksum_reference_variant].at(checksum_reference_tuning); + } Checksum_type getChecksum(VariantID vid, size_t tune_idx) const { return checksum[vid].at(tune_idx); } @@ -518,25 +529,18 @@ class KernelBase } template - long double calcChecksum(DataSpace dataSpace, T* ptr, Size_type len, - Real_type scale_factor = 1.0) + Checksum_type calcChecksum(DataSpace dataSpace, T* ptr, Size_type len) { return rajaperf::calcChecksum(dataSpace, - ptr, len, getDataAlignment(), scale_factor); + ptr, len, getDataAlignment()); } template - long double calcChecksum(T* ptr, Size_type len, VariantID vid) + Checksum_type calcChecksum(T* ptr, Size_type len, VariantID vid) { return calcChecksum(getDataSpace(vid), ptr, len); } - template - long double calcChecksum(T* ptr, Size_type len, Real_type scale_factor, VariantID vid) - { - return calcChecksum(getDataSpace(vid), ptr, len, scale_factor); - } - void startTimer() { synchronize(); @@ -610,8 +614,15 @@ class KernelBase protected: const RunParams& run_params; + struct ChecksumTolerance + { + static constexpr inline Checksum_type zero = 0.0; + static constexpr inline Checksum_type tight = 1e-12; + static constexpr inline Checksum_type normal = 1e-7; + static constexpr inline Checksum_type loose = 5e-6; + }; + std::vector checksum[NumVariants]; - Checksum_type checksum_scale_factor; #if defined(RAJA_ENABLE_TARGET_OPENMP) int did; @@ -659,6 +670,8 @@ class KernelBase bool uses_feature[NumFeatures]; 
ChecksumConsistency checksum_consistency; + Checksum_type checksum_tolerance; + Checksum_type checksum_scale_factor; Complexity complexity; @@ -679,6 +692,9 @@ class KernelBase VariantID running_variant; size_t running_tuning; + VariantID checksum_reference_variant; + size_t checksum_reference_tuning; + std::vector num_exec[NumVariants]; RAJA::Timer timer; diff --git a/src/lcals/DIFF_PREDICT.cpp b/src/lcals/DIFF_PREDICT.cpp index 205fb620e..2ceb6e7f8 100644 --- a/src/lcals/DIFF_PREDICT.cpp +++ b/src/lcals/DIFF_PREDICT.cpp @@ -34,6 +34,7 @@ DIFF_PREDICT::DIFF_PREDICT(const RunParams& params) setFLOPsPerRep(9 * getActualProblemSize()); setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/lcals/EOS.cpp b/src/lcals/EOS.cpp index e3e5a1a8a..31a391689 100644 --- a/src/lcals/EOS.cpp +++ b/src/lcals/EOS.cpp @@ -37,11 +37,11 @@ EOS::EOS(const RunParams& params) setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(16 * getActualProblemSize()); - checksum_scale_factor = 0.0001 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(0.0001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N); @@ -68,7 +68,7 @@ void EOS::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void EOS::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize(), checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize(), vid); } void EOS::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/lcals/FIRST_DIFF.cpp b/src/lcals/FIRST_DIFF.cpp index f16cf2f7f..24bd724d9 100644 --- a/src/lcals/FIRST_DIFF.cpp +++ b/src/lcals/FIRST_DIFF.cpp @@ 
-35,7 +35,8 @@ FIRST_DIFF::FIRST_DIFF(const RunParams& params) setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); - setChecksumConsistency(ChecksumConsistency::Consistent); // assumes FP ops get the same answer across platforms + setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::tight); setComplexity(Complexity::N); diff --git a/src/lcals/FIRST_MIN.cpp b/src/lcals/FIRST_MIN.cpp index d7cb8782e..0e763cd9f 100644 --- a/src/lcals/FIRST_MIN.cpp +++ b/src/lcals/FIRST_MIN.cpp @@ -41,7 +41,8 @@ FIRST_MIN::FIRST_MIN(const RunParams& params) setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); - setChecksumConsistency(ChecksumConsistency::Consistent); // assumes that the loc returned is always from the first of equivalent mins + setChecksumConsistency(ChecksumConsistency::Consistent); // The loc returned is always the first of equivalent mins + setChecksumTolerance(ChecksumTolerance::zero); setComplexity(Complexity::N); @@ -68,7 +69,7 @@ void FIRST_MIN::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void FIRST_MIN::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += static_cast(m_minloc); + checksum[vid][tune_idx] += static_cast(m_minloc); } void FIRST_MIN::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/lcals/FIRST_SUM.cpp b/src/lcals/FIRST_SUM.cpp index a31e98865..31a759961 100644 --- a/src/lcals/FIRST_SUM.cpp +++ b/src/lcals/FIRST_SUM.cpp @@ -35,7 +35,8 @@ FIRST_SUM::FIRST_SUM(const RunParams& params) setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * (m_N-1)); - setChecksumConsistency(ChecksumConsistency::Consistent); // assumes FP ops get the same answer across platforms + setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::tight); setComplexity(Complexity::N); diff --git a/src/lcals/GEN_LIN_RECUR.cpp 
b/src/lcals/GEN_LIN_RECUR.cpp index 74d8ce2bd..37f7ab2f3 100644 --- a/src/lcals/GEN_LIN_RECUR.cpp +++ b/src/lcals/GEN_LIN_RECUR.cpp @@ -38,11 +38,11 @@ GEN_LIN_RECUR::GEN_LIN_RECUR(const RunParams& params) setFLOPsPerRep((3 + 3 ) * m_N); - checksum_scale_factor = 0.01 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(0.01 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N); @@ -67,7 +67,7 @@ void GEN_LIN_RECUR::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void GEN_LIN_RECUR::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_b5, getActualProblemSize(), checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_b5, getActualProblemSize(), vid); } void GEN_LIN_RECUR::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/lcals/HYDRO_1D.cpp b/src/lcals/HYDRO_1D.cpp index ec44c3a69..af42af3bd 100644 --- a/src/lcals/HYDRO_1D.cpp +++ b/src/lcals/HYDRO_1D.cpp @@ -36,11 +36,11 @@ HYDRO_1D::HYDRO_1D(const RunParams& params) setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(5 * getActualProblemSize()); - checksum_scale_factor = 0.001 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(0.001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N); @@ -66,7 +66,7 @@ void HYDRO_1D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void HYDRO_1D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize(), checksum_scale_factor , vid); + checksum[vid][tune_idx] += 
calcChecksum(m_x, getActualProblemSize(), vid); } void HYDRO_1D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/lcals/HYDRO_2D.cpp b/src/lcals/HYDRO_2D.cpp index 9d6b2d7e6..4b3550777 100644 --- a/src/lcals/HYDRO_2D.cpp +++ b/src/lcals/HYDRO_2D.cpp @@ -57,11 +57,11 @@ HYDRO_2D::HYDRO_2D(const RunParams& params) 26 + 4 ) * (m_jn-2)*(m_kn-2)); - checksum_scale_factor = 0.001 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(0.001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N); @@ -91,8 +91,8 @@ void HYDRO_2D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void HYDRO_2D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_zzout, m_array_length, checksum_scale_factor , vid); - checksum[vid][tune_idx] += calcChecksum(m_zrout, m_array_length, checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_zzout, m_array_length, vid); + checksum[vid][tune_idx] += calcChecksum(m_zrout, m_array_length, vid); } void HYDRO_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp index 470265d7c..df4d20db9 100644 --- a/src/lcals/INT_PREDICT.cpp +++ b/src/lcals/INT_PREDICT.cpp @@ -34,6 +34,7 @@ INT_PREDICT::INT_PREDICT(const RunParams& params) setFLOPsPerRep(17 * getActualProblemSize()); setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/lcals/PLANCKIAN.cpp b/src/lcals/PLANCKIAN.cpp index d1316fa73..b83b0e56f 100644 --- a/src/lcals/PLANCKIAN.cpp +++ b/src/lcals/PLANCKIAN.cpp @@ -34,6 +34,7 @@ PLANCKIAN::PLANCKIAN(const RunParams& params) setFLOPsPerRep(4 * 
getActualProblemSize()); // 1 exp setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/lcals/TRIDIAG_ELIM.cpp b/src/lcals/TRIDIAG_ELIM.cpp index e99b8e908..807935e93 100644 --- a/src/lcals/TRIDIAG_ELIM.cpp +++ b/src/lcals/TRIDIAG_ELIM.cpp @@ -36,6 +36,7 @@ TRIDIAG_ELIM::TRIDIAG_ELIM(const RunParams& params) setFLOPsPerRep(2 * (m_N-1)); setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp index e5046dc46..c9e2fca04 100644 --- a/src/polybench/POLYBENCH_2MM.cpp +++ b/src/polybench/POLYBENCH_2MM.cpp @@ -56,11 +56,11 @@ POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& params) setFLOPsPerRep(3 * m_ni*m_nj*m_nk + 2 * m_ni*m_nj*m_nl ); - checksum_scale_factor = 0.000001 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); // Change to Inconsistent if internal reductions use atomics + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(0.000001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N_to_the_three_halves); @@ -85,7 +85,7 @@ void POLYBENCH_2MM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void POLYBENCH_2MM::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_D, m_ni * m_nl, checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_D, m_ni * m_nl, vid); } void POLYBENCH_2MM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp index af208551b..68b22fd2e 100644 --- a/src/polybench/POLYBENCH_3MM.cpp +++ b/src/polybench/POLYBENCH_3MM.cpp @@ -65,11 +65,11 @@ 
POLYBENCH_3MM::POLYBENCH_3MM(const RunParams& params) 2 * m_nj*m_nl*m_nm + 2 * m_ni*m_nj*m_nl ); - checksum_scale_factor = 0.000000001 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); // Change to Inconsistent if internal reductions use atomics + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(0.000000001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N_to_the_three_halves); @@ -96,7 +96,7 @@ void POLYBENCH_3MM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void POLYBENCH_3MM::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_G, m_ni * m_nl, checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_G, m_ni * m_nl, vid); } void POLYBENCH_3MM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp index 95e679cfd..2c980b549 100644 --- a/src/polybench/POLYBENCH_ADI.cpp +++ b/src/polybench/POLYBENCH_ADI.cpp @@ -48,11 +48,11 @@ POLYBENCH_ADI::POLYBENCH_ADI(const RunParams& params) setFLOPsPerRep( (13 + 2) * (m_n-2)*(m_n-2) + (13 + 2) * (m_n-2)*(m_n-2) ); - checksum_scale_factor = 0.0000001 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(0.0000001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N); @@ -75,7 +75,7 @@ void POLYBENCH_ADI::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void POLYBENCH_ADI::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_U, m_n * m_n, checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_U, m_n * m_n, vid); } void 
POLYBENCH_ADI::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp index bb3100060..d323b05c8 100644 --- a/src/polybench/POLYBENCH_ATAX.cpp +++ b/src/polybench/POLYBENCH_ATAX.cpp @@ -44,11 +44,11 @@ POLYBENCH_ATAX::POLYBENCH_ATAX(const RunParams& params) setFLOPsPerRep(2 * m_N*m_N + 2 * m_N*m_N ); - checksum_scale_factor = 0.001 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); // Change to Inconsistent if internal reductions use atomics + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(0.001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N); @@ -72,7 +72,7 @@ void POLYBENCH_ATAX::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void POLYBENCH_ATAX::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_y, m_N, checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_y, m_N, vid); } void POLYBENCH_ATAX::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/polybench/POLYBENCH_FDTD_2D.cpp b/src/polybench/POLYBENCH_FDTD_2D.cpp index aea03e4f2..0d27d65b9 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D.cpp @@ -67,11 +67,11 @@ POLYBENCH_FDTD_2D::POLYBENCH_FDTD_2D(const RunParams& params) 3 * m_nx*(m_ny-1) + 5 * (m_nx-1)*(m_ny-1) ); - checksum_scale_factor = 0.001 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(0.001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N); @@ -94,7 +94,7 @@ void POLYBENCH_FDTD_2D::setUp(VariantID vid, size_t 
RAJAPERF_UNUSED_ARG(tune_idx void POLYBENCH_FDTD_2D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_hz, m_nx * m_ny, checksum_scale_factor, vid); + checksum[vid][tune_idx] += calcChecksum(m_hz, m_nx * m_ny, vid); } void POLYBENCH_FDTD_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp index 08dcb5e57..2b686aef8 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp @@ -38,11 +38,11 @@ POLYBENCH_FLOYD_WARSHALL::POLYBENCH_FLOYD_WARSHALL(const RunParams& params) setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep( m_N*m_N*m_N * 3 / 2 ); // conditional is true about half of the time - checksum_scale_factor = 1.0 * + setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::tight); + setChecksumScaleFactor(1.0 * ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - - setChecksumConsistency(ChecksumConsistency::Consistent); // assumes FP ops get the same answer across platforms + getActualProblemSize() )); setComplexity(Complexity::N_to_the_three_halves); @@ -64,7 +64,7 @@ void POLYBENCH_FLOYD_WARSHALL::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t void POLYBENCH_FLOYD_WARSHALL::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_pout, m_N*m_N, checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_pout, m_N*m_N, vid); } void POLYBENCH_FLOYD_WARSHALL::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp index e1850135b..9a55291a0 100644 --- a/src/polybench/POLYBENCH_GEMM.cpp +++ b/src/polybench/POLYBENCH_GEMM.cpp @@ -48,11 +48,11 @@ POLYBENCH_GEMM::POLYBENCH_GEMM(const RunParams& params) setFLOPsPerRep((1 + 3 * m_nk) * m_ni*m_nj); - 
checksum_scale_factor = 0.001 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); // Change to Inconsistent if internal reductions use atomics + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(0.001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N_to_the_three_halves); @@ -75,7 +75,7 @@ void POLYBENCH_GEMM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void POLYBENCH_GEMM::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_C, m_ni * m_nj, checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_C, m_ni * m_nj, vid); } void POLYBENCH_GEMM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/polybench/POLYBENCH_GEMVER.cpp b/src/polybench/POLYBENCH_GEMVER.cpp index 7ea31efea..f8a5a05c5 100644 --- a/src/polybench/POLYBENCH_GEMVER.cpp +++ b/src/polybench/POLYBENCH_GEMVER.cpp @@ -62,11 +62,11 @@ POLYBENCH_GEMVER::POLYBENCH_GEMVER(const RunParams& params) 1 * m_n + 3 * m_n*m_n ); - checksum_scale_factor = 0.001 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); // Change to Inconsistent if internal reductions use atomics + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(0.001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N); @@ -97,7 +97,7 @@ void POLYBENCH_GEMVER::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) void POLYBENCH_GEMVER::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_w, m_n, checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_w, m_n, vid); } void POLYBENCH_GEMVER::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git 
a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp index 89ae3fd4a..f035e51c7 100644 --- a/src/polybench/POLYBENCH_GESUMMV.cpp +++ b/src/polybench/POLYBENCH_GESUMMV.cpp @@ -44,6 +44,7 @@ POLYBENCH_GESUMMV::POLYBENCH_GESUMMV(const RunParams& params) 3 ) * m_N ); setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); // Change to Inconsistent if internal reductions use atomics + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp index d8150325f..df1fd0b5f 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D.cpp @@ -42,11 +42,11 @@ POLYBENCH_HEAT_3D::POLYBENCH_HEAT_3D(const RunParams& params) setFLOPsPerRep( 15 * (m_N-2) * (m_N-2) * (m_N-2) + 15 * (m_N-2) * (m_N-2) * (m_N-2) ); - checksum_scale_factor = 0.0001 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(0.0001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N); @@ -70,8 +70,8 @@ void POLYBENCH_HEAT_3D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx void POLYBENCH_HEAT_3D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_A, m_N*m_N*m_N, checksum_scale_factor , vid); - checksum[vid][tune_idx] += calcChecksum(m_B, m_N*m_N*m_N, checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_A, m_N*m_N*m_N, vid); + checksum[vid][tune_idx] += calcChecksum(m_B, m_N*m_N*m_N, vid); } void POLYBENCH_HEAT_3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp index bad9a047c..be2c48866 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.cpp +++ 
b/src/polybench/POLYBENCH_JACOBI_1D.cpp @@ -41,11 +41,16 @@ POLYBENCH_JACOBI_1D::POLYBENCH_JACOBI_1D(const RunParams& params) setFLOPsPerRep( 3 * (m_N-2) + 3 * (m_N-2) ); - checksum_scale_factor = 0.0001 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); +#if defined(RAJA_ENABLE_TARGET_OPENMP) + // TODO: base omp target variant result is off + setChecksumTolerance(ChecksumTolerance::loose); +#else + setChecksumTolerance(ChecksumTolerance::normal); +#endif + setChecksumScaleFactor(0.0001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N); @@ -69,8 +74,8 @@ void POLYBENCH_JACOBI_1D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i void POLYBENCH_JACOBI_1D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_A, m_N, checksum_scale_factor , vid); - checksum[vid][tune_idx] += calcChecksum(m_B, m_N, checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_A, m_N, vid); + checksum[vid][tune_idx] += calcChecksum(m_B, m_N, vid); } void POLYBENCH_JACOBI_1D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp index 08f04bd4e..c8de621c1 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp @@ -41,11 +41,11 @@ POLYBENCH_JACOBI_2D::POLYBENCH_JACOBI_2D(const RunParams& params) setFLOPsPerRep( 5 * (m_N-2)*(m_N-2) + 5 * (m_N-2)*(m_N-2) ); - checksum_scale_factor = 0.0001 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(0.0001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N); @@ -69,8 +69,8 @@ void 
POLYBENCH_JACOBI_2D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i void POLYBENCH_JACOBI_2D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_A, m_N*m_N, checksum_scale_factor , vid); - checksum[vid][tune_idx] += calcChecksum(m_B, m_N*m_N, checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_A, m_N*m_N, vid); + checksum[vid][tune_idx] += calcChecksum(m_B, m_N*m_N, vid); } void POLYBENCH_JACOBI_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp index 8a35c2f65..9ee01e216 100644 --- a/src/polybench/POLYBENCH_MVT.cpp +++ b/src/polybench/POLYBENCH_MVT.cpp @@ -45,11 +45,11 @@ POLYBENCH_MVT::POLYBENCH_MVT(const RunParams& params) setFLOPsPerRep(2 * m_N*m_N + 2 * m_N*m_N ); - checksum_scale_factor = 1.0 * - ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); // Change to Inconsistent if internal reductions use atomics + setChecksumTolerance(ChecksumTolerance::normal); + setChecksumScaleFactor(1.0 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N); @@ -74,8 +74,8 @@ void POLYBENCH_MVT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void POLYBENCH_MVT::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_x1, m_N, checksum_scale_factor , vid); - checksum[vid][tune_idx] += calcChecksum(m_x2, m_N, checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_x1, m_N, vid); + checksum[vid][tune_idx] += calcChecksum(m_x2, m_N, vid); } void POLYBENCH_MVT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp index b78bc35f4..f04a5a394 100644 --- a/src/stream/ADD.cpp +++ b/src/stream/ADD.cpp @@ -33,7 +33,8 @@ ADD::ADD(const RunParams& params) 
setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); - setChecksumConsistency(ChecksumConsistency::Consistent); // assumes FP ops get the same answer across platforms + setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::tight); setComplexity(Complexity::N); diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp index 206acf31b..907c4f292 100644 --- a/src/stream/COPY.cpp +++ b/src/stream/COPY.cpp @@ -34,6 +34,7 @@ COPY::COPY(const RunParams& params) setFLOPsPerRep(0); setChecksumConsistency(ChecksumConsistency::Consistent); + setChecksumTolerance(ChecksumTolerance::zero); setComplexity(Complexity::N); diff --git a/src/stream/DOT.cpp b/src/stream/DOT.cpp index a3d782248..80cc4ab96 100644 --- a/src/stream/DOT.cpp +++ b/src/stream/DOT.cpp @@ -35,6 +35,7 @@ DOT::DOT(const RunParams& params) setFLOPsPerRep(2 * getActualProblemSize()); setChecksumConsistency(ChecksumConsistency::Inconsistent); + setChecksumTolerance(ChecksumTolerance::normal); setComplexity(Complexity::N); diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp index dffbf8528..3bed14cae 100644 --- a/src/stream/MUL.cpp +++ b/src/stream/MUL.cpp @@ -33,7 +33,8 @@ MUL::MUL(const RunParams& params) setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); - setChecksumConsistency(ChecksumConsistency::Consistent); // assumes FP ops get the same answer across platforms + setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::tight); setComplexity(Complexity::N); diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp index b61be0a2f..306f41534 100644 --- a/src/stream/TRIAD.cpp +++ b/src/stream/TRIAD.cpp @@ -33,11 +33,11 @@ TRIAD::TRIAD(const RunParams& params) setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(2 * getActualProblemSize()); - checksum_scale_factor = 0.001 * - ( static_cast(getDefaultProblemSize()) / - 
getActualProblemSize() ); - setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + setChecksumTolerance(ChecksumTolerance::tight); + setChecksumScaleFactor(0.001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() )); setComplexity(Complexity::N); @@ -60,7 +60,7 @@ void TRIAD::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void TRIAD::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_a, getActualProblemSize(), checksum_scale_factor , vid); + checksum[vid][tune_idx] += calcChecksum(m_a, getActualProblemSize(), vid); } void TRIAD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/test/test-raja-perf-suite.cpp b/test/test-raja-perf-suite.cpp index 1ad403a8a..3fe2710b0 100644 --- a/test/test-raja-perf-suite.cpp +++ b/test/test-raja-perf-suite.cpp @@ -50,9 +50,6 @@ int main( int argc, char** argv ) TEST(ShortSuiteTest, Basic) { - // default checksum tolerance for test pass/fail - rajaperf::Checksum_type chksum_tol = 1e-7; - // Assemble command line args for basic test std::vector< std::string > sargv{}; @@ -76,10 +73,6 @@ TEST(ShortSuiteTest, Basic) #if !defined(_WIN32) #if defined(RAJA_ENABLE_TARGET_OPENMP) - // checksum tolerance reduced b/c bas omp target variant of JACOBI_1D - // kernel result is off - chksum_tol = 5e-6; - sargv.emplace_back(std::string("--exclude-kernels")); sargv.emplace_back(std::string("Comm")); sargv.emplace_back(std::string("EDGE3D")); @@ -134,27 +127,8 @@ TEST(ShortSuiteTest, Basic) rajaperf::KernelBase* kernel = kernels[ik]; - // - // Get reference checksum (first kernel variant run) - // - rajaperf::Checksum_type cksum_ref = 0.0; - size_t ivck = 0; - bool found_ref = false; - while ( ivck < variant_ids.size() && !found_ref ) { - - rajaperf::VariantID vid = variant_ids[ivck]; - size_t num_tunings = kernel->getNumVariantTunings(vid); - for (size_t tune_idx = 0; tune_idx < num_tunings; ++tune_idx) { - if ( 
kernel->wasVariantTuningRun(vid, tune_idx) ) { - cksum_ref = kernel->getChecksum(vid, tune_idx); - found_ref = true; - break; - } - } - ++ivck; - - } // while loop over variants until reference checksum found - + rajaperf::Checksum_type cksum_tol = kernel->getChecksumTolerance(); + rajaperf::Checksum_type cksum_ref = kernel->getReferenceChecksum(); // // Check execution time is greater than zero and checksum diff is @@ -180,7 +154,7 @@ TEST(ShortSuiteTest, Basic) << kernel->getVariantTuningName(vid, tune_idx) << std::endl; EXPECT_GT(rtime, 0.0); - EXPECT_LT(cksum_diff, chksum_tol); + EXPECT_LE(cksum_diff, cksum_tol); } }