llnl · MrBurmark · Dec 16, 2025 · Dec 16, 2025 · Dec 17, 2025 · Dec 18, 2025
diff --git a/docs/sphinx/dev_guide/kernel_class_impl.rst b/docs/sphinx/dev_guide/kernel_class_impl.rst
@@ -50,8 +50,13 @@ The methods in the source file are:
         is the problem size and the number of loop kernels is one. Other kernels
         in the Suite may execute multiple loop kernels with different sizes,
         so these methods are used to describe this.
-      * The number of bytes read and written and the number of floating point
-        operations (FLOPS) performed for each kernel execution.
+      * The number of bytes read for each kernel execution.
+      * The number of bytes written for each kernel execution.
+      * The number of bytes read, modified, and written for each kernel execution.
+      * The number of bytes atomically read, modified, and written for each
+        kernel execution.
+      * The number of floating point operations (FLOPS) performed for each
+        kernel execution.
       * The consistency of the checksums of the kernel. The possible values are
         ``Consistent`` where all the variant tunings always get the same checksum,
         ``ConsistentPerVariantTuning`` where an individual variant tuning always
@@ -66,6 +71,22 @@ The methods in the source file are:
         not every kernel implements every variant, so ``KernelBase`` provides a
         "default" implementation that defines no variants or tunings.
 
+    .. note:: The byte counters are intended to count traffic to and from main
+              memory like DRAM or HBM under idealized conditions with perfect
+              caching. They are not intended to count the total number of bytes
+              requested by load and store instructions. So, even if a memory
+              address is read in multiple different iterations of a loop with a
+              stencil access pattern, it is only counted once in bytes read.
+              However, caching is not assumed between loops/kernel launches, so
+              an address is counted once for each separate loop or kernel launch.
+
+    .. note:: To simplify counting, each address accessed should be counted in
+              only one of the byte counter attributes. For example, an address
+              that is read and written is counted in the "read, modified, and
+              written" counter, but not in the "read" or "written" counters. The
+              final output, however, does add the "read" and "read, modified, and
+              written" counters when showing the bytes read.
+
     ..note:: Available variant tunings for each kernel are specified using a
              ``...BOILERPLATE...`` macro invocation in each kernel variant
              source file. This is discussed in :ref:`kernel_class_impl_exec-label`.

diff --git a/docs/sphinx/user_guide/output.rst b/docs/sphinx/user_guide/output.rst
@@ -80,21 +80,33 @@ Information reported in the file for each kernel is:
   * **Kernels/rep** -- total number of loop structures run (or GPU kernels 
     launched) in each kernel repetition.
   * **Bytes/rep** -- Total number of bytes read from and written to memory for 
-    each repetition of kernel.
+    each repetition of kernel. This is a best-case estimate of the total traffic
+    to and from memory, assuming perfect cache reuse and ignoring partial usage
+    of data in some memory transactions.
   * **FLOPs/rep** -- Total number of floating point operations executed for 
     each repetition of kernel. Currently, we count arithmetic operations 
     (+, -, *, /) and functions, such as exp, sin, etc. as one FLOP. We do not 
     currently count operations like abs and comparisons (<, >, etc.) in the 
     FLOP count. So these numbers are rough estimates. For actual FLOP counts, 
     a performance analysis tool should be used.
+  * **BytesTouched/rep** -- Total number of bytes accessed for each repetition
+    of kernel. This is a best-case estimate of the amount of cache needed to
+    fit all of the data used by the kernel, ignoring partial usage of some
+    cache lines.
   * **BytesRead/rep** -- Total number of bytes read from memory for 
     each repetition of kernel.
   * **BytesWritten/rep** -- Total number of bytes written to memory for 
     each repetition of kernel.
+  * **BytesModifyWritten/rep** -- Total number of bytes modified for each
+    repetition of kernel. This is the intersection of the bytes counted in
+    both ``BytesRead/rep`` and ``BytesWritten/rep``.
   * **BytesAtomicModifyWritten/rep** -- Total number of bytes modified by 
     atomic operations in a kernel. If a kernel contains no atomic operations, 
     the value of zero is reported.
 
+  .. note:: The Bytes*/rep and FLOPs/rep counts are estimates for kernels
+            involving randomness or difficult-to-count algorithms.
+
 .. _output_probsize-label:
 
 ============================

diff --git a/src/algorithm/ATOMIC.cpp b/src/algorithm/ATOMIC.cpp
@@ -30,7 +30,8 @@ ATOMIC::ATOMIC(const RunParams& params)
   setKernelsPerRep(1);
   setBytesReadPerRep( 0 );
   setBytesWrittenPerRep( 0 );
-  setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesModifyWrittenPerRep( 0 );
+  setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // atomic (assumes replication == problem size)
   setFLOPsPerRep(getActualProblemSize());
 
   setChecksumConsistency(ChecksumConsistency::Inconsistent); // atomics

diff --git a/src/algorithm/HISTOGRAM.cpp b/src/algorithm/HISTOGRAM.cpp
@@ -34,10 +34,10 @@ HISTOGRAM::HISTOGRAM(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Data_type) * m_num_bins +
-                      1*sizeof(Index_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Data_type) * m_num_bins );
-  setBytesAtomicModifyWrittenPerRep( 0 );
+  setBytesReadPerRep( 1*sizeof(Index_type) * getActualProblemSize() ); // bins
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 0 );
+  setBytesAtomicModifyWrittenPerRep( 1*sizeof(Data_type) * m_num_bins ); // counts
   setFLOPsPerRep( (std::is_floating_point_v<Data_type> ? 1 : 0) * getActualProblemSize() );
 
   setChecksumConsistency(ChecksumConsistency::Consistent); // integer arithmetic

diff --git a/src/algorithm/MEMCPY.cpp b/src/algorithm/MEMCPY.cpp
@@ -28,8 +28,9 @@ MEMCPY::MEMCPY(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // x
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // y
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 

diff --git a/src/algorithm/MEMCPY.hpp b/src/algorithm/MEMCPY.hpp
@@ -10,7 +10,7 @@
 /// MEMCPY kernel reference implementation:
 ///
 /// for (Index_type i = ibegin; i < iend; ++i ) {
-///   x[i] = val ;
+///   y[i] = x[i] ;
 /// }
 ///
 

diff --git a/src/algorithm/MEMSET.cpp b/src/algorithm/MEMSET.cpp
@@ -29,7 +29,8 @@ MEMSET::MEMSET(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
   setBytesReadPerRep( 0 );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // x
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 

diff --git a/src/algorithm/REDUCE_SUM.cpp b/src/algorithm/REDUCE_SUM.cpp
@@ -28,8 +28,9 @@ REDUCE_SUM::REDUCE_SUM(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) * (1+getActualProblemSize()) );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) );
+  setBytesReadPerRep( 1*sizeof(Real_type) * (1+getActualProblemSize()) ); // x
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(getActualProblemSize());
 

diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp
@@ -28,8 +28,9 @@ SCAN::SCAN(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // x
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // y
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * getActualProblemSize());
 

diff --git a/src/algorithm/SORT.cpp b/src/algorithm/SORT.cpp
@@ -28,8 +28,10 @@ SORT::SORT(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // not useful in this case due to O(n*log(n)) algorithm
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // not useful in this case due to O(n*log(n)) algorithm
+  // not useful in this case due to O(n*log(n)) algorithm
+  setBytesReadPerRep( 0 );
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // x
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 

diff --git a/src/algorithm/SORTPAIRS.cpp b/src/algorithm/SORTPAIRS.cpp
@@ -28,8 +28,10 @@ SORTPAIRS::SORTPAIRS(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // not useful in this case due to O(n*log(n)) algorithm
-  setBytesWrittenPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // not useful in this case due to O(n*log(n)) algorithm
+  // not useful in this case due to O(n*log(n)) algorithm
+  setBytesReadPerRep( 0 );
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // x, i
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 

diff --git a/src/apps/CONVECTION3DPA.cpp b/src/apps/CONVECTION3DPA.cpp
@@ -36,9 +36,10 @@ CONVECTION3DPA::CONVECTION3DPA(const RunParams& params)
   setKernelsPerRep(1);
 
   setBytesReadPerRep( 3*sizeof(Real_type) * CPA_Q1D*CPA_D1D + // b, bt, g
-                      2*sizeof(Real_type) * CPA_D1D*CPA_D1D*CPA_D1D*m_NE + // x, y
+                      1*sizeof(Real_type) * CPA_D1D*CPA_D1D*CPA_D1D*m_NE + // x
                CPA_VDIM*sizeof(Real_type) * CPA_Q1D*CPA_Q1D*CPA_Q1D*m_NE ); // d
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * CPA_D1D*CPA_D1D*CPA_D1D*m_NE ); // y
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * CPA_D1D*CPA_D1D*CPA_D1D*m_NE ); // y
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   setFLOPsPerRep(m_NE * (

diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp
@@ -37,9 +37,10 @@ DEL_DOT_VEC_2D::DEL_DOT_VEC_2D(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() +
-                      4*sizeof(Real_type) * m_domain->n_real_nodes ); // 4 variables with 2d nodal stencil pattern: 4 touches per iterate
-  setBytesWrittenPerRep( 1*sizeof(Index_type) * getItsPerRep() );
+  setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() + // real_zones
+                      4*sizeof(Real_type) * m_domain->n_real_nodes ); // x, y, fx, fy (2d nodal stencil pattern: 4 touches per iterate)
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() ); // div
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(54 * m_domain->n_real_zones);
 

diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp
@@ -36,9 +36,10 @@ DIFFUSION3DPA::DIFFUSION3DPA(const RunParams& params)
   setKernelsPerRep(1);
 
   setBytesReadPerRep( 2*sizeof(Real_type) * DPA_Q1D*DPA_D1D + // b, g
-                      2*sizeof(Real_type) * DPA_D1D*DPA_D1D*DPA_D1D*m_NE + // x, y
+                      1*sizeof(Real_type) * DPA_D1D*DPA_D1D*DPA_D1D*m_NE + // x
                 DPA_SYM*sizeof(Real_type) * DPA_Q1D*DPA_Q1D*DPA_Q1D*m_NE ); // d
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * DPA_D1D*DPA_D1D*DPA_D1D*m_NE ); // y
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * DPA_D1D*DPA_D1D*DPA_D1D*m_NE ); // y
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   setFLOPsPerRep(m_NE * (DPA_Q1D * DPA_D1D +

diff --git a/src/apps/EDGE3D.cpp b/src/apps/EDGE3D.cpp
@@ -40,8 +40,9 @@ EDGE3D::EDGE3D(const RunParams& params)
 
   // touched data size, not actual number of stores and loads
   // see VOL3D.cpp
-  setBytesReadPerRep( 3*sizeof(Real_type) * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() );
+  setBytesReadPerRep( 3*sizeof(Real_type) * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) ); // x, y, z (3d nodal stencil pattern: 8 touches per iterate)
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() ); // sum
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   constexpr size_t flops_k_loop = 15

diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp
@@ -29,21 +29,28 @@ ENERGY::ENERGY(const RunParams& params)
   setItsPerRep( 6 * getActualProblemSize() );
   setKernelsPerRep(6);
   // some branches are never taken due to the nature of the initialization of delvc
-  // the additional reads that would be done if those branches were taken are noted in the comments
-  setBytesReadPerRep((5*sizeof(Real_type) +
-                      1*sizeof(Real_type) + // 8
-                      6*sizeof(Real_type) +
-                      2*sizeof(Real_type) +
-                      7*sizeof(Real_type) + // 12
-                      1*sizeof(Real_type)   // 8
+  // the additional ops that would be done if those branches were taken are noted in the comments
+  setBytesReadPerRep((5*sizeof(Real_type) + // e_old, delvc, p_old, q_old, work
+                      1*sizeof(Real_type) + // delvc (+7 : compHalfStep, pbvc, e_new, bvc, pHalfStep, ql_old, qq_old)
+                      5*sizeof(Real_type) + // delvc, p_old, q_old, pHalfStep, q_new
+                      1*sizeof(Real_type) + // work
+                      6*sizeof(Real_type) + // delvc, p_old, q_old, pHalfStep, q_new, p_new (+5 : pbvc, vnewc, bvc, ql_old, qq_old )
+                      1*sizeof(Real_type)   // delvc (+7 : pbvc, e_new, vnewc, bvc, p_new, ql_old, qq_old )
                       ) * getActualProblemSize() );
-  setBytesWrittenPerRep((1*sizeof(Real_type) +
-                         1*sizeof(Real_type) +
-                         1*sizeof(Real_type) +
-                         1*sizeof(Real_type) +
-                         1*sizeof(Real_type) +
-                         0*sizeof(Real_type)
+  setBytesWrittenPerRep((1*sizeof(Real_type) + // e_new
+                         1*sizeof(Real_type) + // q_new
+                         0*sizeof(Real_type) +
+                         0*sizeof(Real_type) +
+                         0*sizeof(Real_type) +
+                         0*sizeof(Real_type)   // (+1 : q_new )
                          ) * getActualProblemSize() );
+  setBytesModifyWrittenPerRep( (0*sizeof(Real_type) +
+                                0*sizeof(Real_type) +
+                                1*sizeof(Real_type) + // e_new
+                                1*sizeof(Real_type) + // e_new
+                                1*sizeof(Real_type) + // e_new
+                                0*sizeof(Real_type)
+                                ) * getActualProblemSize() );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep((6  +
                   11 + // 1 sqrt

diff --git a/src/apps/FEMSWEEP.cpp b/src/apps/FEMSWEEP.cpp
@@ -48,13 +48,13 @@ FEMSWEEP::FEMSWEEP(const RunParams& params)
   setItsPerRep(1);
   setKernelsPerRep(1);
   // using total data size instead of writes and reads
-  setBytesReadPerRep( 1*sizeof(Real_type) * m_Blen +
-                      1*sizeof(Real_type) * m_Alen +
-                      1*sizeof(Real_type) * m_Flen +
-                      1*sizeof(Real_type) * m_Sglen +
-                      1*sizeof(Real_type) * m_M0len +
-                      1*sizeof(Real_type) * m_Xlen );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_Xlen );
+  setBytesReadPerRep( 1*sizeof(Real_type) * m_Blen + // Bdat
+                      1*sizeof(Real_type) * m_Alen + // Adat
+                      1*sizeof(Real_type) * m_Flen + // Fdat
+                      1*sizeof(Real_type) * m_Sglen + // Sgdat
+                      1*sizeof(Real_type) * m_M0len ); // M0dat
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * m_Xlen ); // Xdat
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   // This is an estimate of the upper bound FLOPs.

diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp
@@ -30,9 +30,10 @@ FIR::FIR(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( m_coefflen*sizeof(Real_type) +
-                      1*sizeof(Real_type) * (getActualProblemSize() + m_coefflen-1) );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( m_coefflen*sizeof(Real_type) + // coeff
+                      1*sizeof(Real_type) * (getActualProblemSize() + m_coefflen-1) ); // in
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // out
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep((2 * m_coefflen) * getActualProblemSize());
 

diff --git a/src/apps/INTSC_HEXHEX.cpp b/src/apps/INTSC_HEXHEX.cpp
@@ -65,6 +65,7 @@ INTSC_HEXHEX::INTSC_HEXHEX(const RunParams& params)
   //   A standard intersection is 8 subzone intersections.
   //
   setBytesWrittenPerRep( 13*8*sizeof(Real_type) * getItsPerRep() );
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   constexpr Size_type flops_per_tri = 336 ;

diff --git a/src/apps/INTSC_HEXRECT.cpp b/src/apps/INTSC_HEXRECT.cpp
@@ -73,6 +73,7 @@ INTSC_HEXRECT::INTSC_HEXRECT(const RunParams& params)
 
   // Bytes written : nvals_hexrect (=4) doubles for each intersection.
   setBytesWrittenPerRep( nvals_hexrect*sizeof(Real_type) * getItsPerRep() );
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   constexpr Size_type flops_per_tri = 150 ;

diff --git a/src/apps/LTIMES.cpp b/src/apps/LTIMES.cpp
@@ -42,10 +42,10 @@ LTIMES::LTIMES(const RunParams& params)
   setItsPerRep( m_philen );
   setKernelsPerRep(1);
   // using total data size instead of writes and reads
-  setBytesReadPerRep( 1*sizeof(Real_type) * m_philen +
-                      1*sizeof(Real_type) * m_elllen +
-                      1*sizeof(Real_type) * m_psilen );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_philen );
+  setBytesReadPerRep( 1*sizeof(Real_type) * m_elllen + // ell
+                      1*sizeof(Real_type) * m_psilen ); // psi
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * m_philen ); // phi
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * m_num_z * m_num_g * m_num_m * m_num_d);
 

diff --git a/src/apps/LTIMES_NOVIEW.cpp b/src/apps/LTIMES_NOVIEW.cpp
@@ -42,10 +42,10 @@ LTIMES_NOVIEW::LTIMES_NOVIEW(const RunParams& params)
   setItsPerRep( m_philen );
   setKernelsPerRep(1);
   // using total data size instead of writes and reads
-  setBytesReadPerRep( 1*sizeof(Real_type) * m_philen +
-                      1*sizeof(Real_type) * m_elllen +
-                      1*sizeof(Real_type) * m_psilen );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_philen );
+  setBytesReadPerRep( 1*sizeof(Real_type) * m_elllen + // ell
+                      1*sizeof(Real_type) * m_psilen ); // psi
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * m_philen ); // phi
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * m_num_z * m_num_g * m_num_m * m_num_d);
 

diff --git a/src/apps/MASS3DEA.cpp b/src/apps/MASS3DEA.cpp
@@ -40,6 +40,7 @@ MASS3DEA::MASS3DEA(const RunParams& params)
   setBytesReadPerRep( 1*sizeof(Real_type) * MEA_Q1D*MEA_D1D + // B
                       1*sizeof(Real_type) * MEA_Q1D*MEA_Q1D*MEA_Q1D*m_NE ); // D
   setBytesWrittenPerRep( 1*sizeof(Real_type) * ea_mat_entries*m_NE ); // M_e
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   setFLOPsPerRep(m_NE * 7 * ea_mat_entries);

diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp
@@ -36,9 +36,10 @@ MASS3DPA::MASS3DPA(const RunParams& params)
   setKernelsPerRep(1);
 
   setBytesReadPerRep( 2*sizeof(Real_type) * MPA_Q1D*MPA_D1D + // B, Bt
-                      2*sizeof(Real_type) * MPA_D1D*MPA_D1D*MPA_D1D*m_NE + // X, Y
+                      1*sizeof(Real_type) * MPA_D1D*MPA_D1D*MPA_D1D*m_NE + // X
                       1*sizeof(Real_type) * MPA_Q1D*MPA_Q1D*MPA_Q1D*m_NE ); // D
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * MPA_D1D*MPA_D1D*MPA_D1D*m_NE ); // Y
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * MPA_D1D*MPA_D1D*MPA_D1D*m_NE ); // Y
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   setFLOPsPerRep(m_NE * (2 * MPA_D1D * MPA_D1D * MPA_D1D * MPA_Q1D +

diff --git a/src/apps/MASSVEC3DPA.cpp b/src/apps/MASSVEC3DPA.cpp
@@ -43,6 +43,7 @@ MASSVEC3DPA::MASSVEC3DPA(const RunParams &params)
                          m_NE); // D
   setBytesWrittenPerRep(3 * sizeof(Real_type) * MVPA_D1D * MVPA_D1D * MVPA_D1D *
                         MVPA_DIM * m_NE); // Y (3 components)
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep(0);
 
   //3 for the dimension loop