diff --git a/docs/sphinx/dev_guide/kernel_class_impl.rst b/docs/sphinx/dev_guide/kernel_class_impl.rst
index 4bf46230f..10ed11dd6 100644
--- a/docs/sphinx/dev_guide/kernel_class_impl.rst
+++ b/docs/sphinx/dev_guide/kernel_class_impl.rst
@@ -50,8 +50,13 @@ The methods in the source file are:
         is the problem size and the number of loop kernels is one. Other kernels
         in the Suite may execute multiple loop kernels with different sizes,
         so these methods are used to describe this.
-      * The number of bytes read and written and the number of floating point
-        operations (FLOPS) performed for each kernel execution.
+      * The number of bytes read for each kernel execution.
+      * The number of bytes written for each kernel execution.
+      * The number of bytes read, modified, and written for each kernel execution.
+      * The number of bytes atomically read, modified, and written for each
+        kernel execution.
+      * The number of floating point operations (FLOPS) performed for each
+        kernel execution.
       * The consistency of the checksums of the kernel. The possible values are
         ``Consistent`` where all the variant tunings always get the same checksum,
         ``ConsistentPerVariantTuning`` where an individual variant tuning always
@@ -66,6 +71,22 @@ The methods in the source file are:
         not every kernel implements every variant, so ``KernelBase`` provides a
         "default" implementation that defines no variants or tunings.
 
+    .. note:: The byte counters are intended to count traffic to and from main
+              memory, such as DRAM or HBM, under idealized conditions with
+              perfect caching. They are not intended to count the total number
+              of bytes requested by load and store instructions. So, even if a
+              memory address is read in multiple iterations of a loop with a
+              stencil access pattern, it is counted only once in bytes read.
+              However, caching is not assumed between loops or kernel launches,
+              so an address is counted once for each separate loop or kernel launch.
+
+    .. note:: To simplify counting, each address accessed should be counted in
+              only one of the byte counter attributes. For example, an address
+              that is read and written is counted in the "read, modified, and
+              written" counter, but not in the "read" or "written" counters.
+              The final output, however, does add the "read" and "read,
+              modified, and written" counters when showing the bytes read.
+
     ..note:: Available variant tunings for each kernel are specified using a
              ``...BOILERPLATE...`` macro invocation in each kernel variant
              source file. This is discussed in :ref:`kernel_class_impl_exec-label`.
diff --git a/docs/sphinx/user_guide/output.rst b/docs/sphinx/user_guide/output.rst
index 522f220a8..3fee5cf32 100644
--- a/docs/sphinx/user_guide/output.rst
+++ b/docs/sphinx/user_guide/output.rst
@@ -80,21 +80,33 @@ Information reported in the file for each kernel is:
   * **Kernels/rep** -- total number of loop structures run (or GPU kernels 
     launched) in each kernel repetition.
   * **Bytes/rep** -- Total number of bytes read from and written to memory for 
-    each repetition of kernel.
+    each repetition of kernel. This is a best-case estimate of the total
+    traffic to and from memory, assuming perfect cache reuse and ignoring
+    partial usage of data in some memory transactions.
   * **FLOPs/rep** -- Total number of floating point operations executed for 
     each repetition of kernel. Currently, we count arithmetic operations 
     (+, -, *, /) and functions, such as exp, sin, etc. as one FLOP. We do not 
     currently count operations like abs and comparisons (<, >, etc.) in the 
     FLOP count. So these numbers are rough estimates. For actual FLOP counts, 
     a performance analysis tool should be used.
+  * **BytesTouched/rep** -- Total number of bytes accessed for each repetition
+    of kernel. This is a best-case estimate of the amount of cache needed to
+    fit all of the data used by the kernel, ignoring partial usage of some
+    cache lines.
   * **BytesRead/rep** -- Total number of bytes read from memory for 
     each repetition of kernel.
   * **BytesWritten/rep** -- Total number of bytes written to memory for 
     each repetition of kernel.
+  * **BytesModifyWritten/rep** -- Total number of bytes modified for each
+    repetition of kernel. This is the intersection of the bytes counted in
+    ``BytesRead/rep`` and ``BytesWritten/rep``.
   * **BytesAtomicModifyWritten/rep** -- Total number of bytes modified by 
     atomic operations in a kernel. If a kernel contains no atomic operations, 
     the value of zero is reported.
 
+  .. note:: The Bytes*/rep and FLOPs/rep counts are estimates for kernels
+            involving randomness or difficult-to-count algorithms.
+
 .. _output_probsize-label:
 
 ============================
diff --git a/src/algorithm/ATOMIC.cpp b/src/algorithm/ATOMIC.cpp
index 82e825562..bbca956c9 100644
--- a/src/algorithm/ATOMIC.cpp
+++ b/src/algorithm/ATOMIC.cpp
@@ -30,7 +30,8 @@ ATOMIC::ATOMIC(const RunParams& params)
   setKernelsPerRep(1);
   setBytesReadPerRep( 0 );
   setBytesWrittenPerRep( 0 );
-  setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesModifyWrittenPerRep( 0 );
+  setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // atomic (assumes replication == problem size)
   setFLOPsPerRep(getActualProblemSize());
 
   setChecksumConsistency(ChecksumConsistency::Inconsistent); // atomics
diff --git a/src/algorithm/HISTOGRAM.cpp b/src/algorithm/HISTOGRAM.cpp
index 27325a0dc..9a51fde48 100644
--- a/src/algorithm/HISTOGRAM.cpp
+++ b/src/algorithm/HISTOGRAM.cpp
@@ -34,10 +34,10 @@ HISTOGRAM::HISTOGRAM(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Data_type) * m_num_bins +
-                      1*sizeof(Index_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Data_type) * m_num_bins );
-  setBytesAtomicModifyWrittenPerRep( 0 );
+  setBytesReadPerRep( 1*sizeof(Index_type) * getActualProblemSize() ); // bins
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 0 );
+  setBytesAtomicModifyWrittenPerRep( 1*sizeof(Data_type) * m_num_bins ); // counts
   setFLOPsPerRep( (std::is_floating_point_v<Data_type> ? 1 : 0) * getActualProblemSize() );
 
   setChecksumConsistency(ChecksumConsistency::Consistent); // integer arithmetic
diff --git a/src/algorithm/MEMCPY.cpp b/src/algorithm/MEMCPY.cpp
index 16142c9ee..c4d86688d 100644
--- a/src/algorithm/MEMCPY.cpp
+++ b/src/algorithm/MEMCPY.cpp
@@ -28,8 +28,9 @@ MEMCPY::MEMCPY(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // x
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // y
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/algorithm/MEMCPY.hpp b/src/algorithm/MEMCPY.hpp
index 02f814788..7a20c056b 100644
--- a/src/algorithm/MEMCPY.hpp
+++ b/src/algorithm/MEMCPY.hpp
@@ -10,7 +10,7 @@
 /// MEMCPY kernel reference implementation:
 ///
 /// for (Index_type i = ibegin; i < iend; ++i ) {
-///   x[i] = val ;
+///   y[i] = x[i] ;
 /// }
 ///
 
diff --git a/src/algorithm/MEMSET.cpp b/src/algorithm/MEMSET.cpp
index 9b1466b00..010f65ec0 100644
--- a/src/algorithm/MEMSET.cpp
+++ b/src/algorithm/MEMSET.cpp
@@ -29,7 +29,8 @@ MEMSET::MEMSET(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
   setBytesReadPerRep( 0 );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // x
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/algorithm/REDUCE_SUM.cpp b/src/algorithm/REDUCE_SUM.cpp
index 68efa8e79..7f218b53a 100644
--- a/src/algorithm/REDUCE_SUM.cpp
+++ b/src/algorithm/REDUCE_SUM.cpp
@@ -28,8 +28,9 @@ REDUCE_SUM::REDUCE_SUM(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) * (1+getActualProblemSize()) );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) );
+  setBytesReadPerRep( 1*sizeof(Real_type) * (1+getActualProblemSize()) ); // x
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(getActualProblemSize());
 
diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp
index 587c03b82..e056c83d3 100644
--- a/src/algorithm/SCAN.cpp
+++ b/src/algorithm/SCAN.cpp
@@ -28,8 +28,9 @@ SCAN::SCAN(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // x
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // y
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * getActualProblemSize());
 
diff --git a/src/algorithm/SORT.cpp b/src/algorithm/SORT.cpp
index 6e011ad27..f12d8faca 100644
--- a/src/algorithm/SORT.cpp
+++ b/src/algorithm/SORT.cpp
@@ -28,8 +28,10 @@ SORT::SORT(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // not useful in this case due to O(n*log(n)) algorithm
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // not useful in this case due to O(n*log(n)) algorithm
+  // not useful in this case due to O(n*log(n)) algorithm
+  setBytesReadPerRep( 0 );
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // x
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/algorithm/SORTPAIRS.cpp b/src/algorithm/SORTPAIRS.cpp
index ca5407481..28d20e75e 100644
--- a/src/algorithm/SORTPAIRS.cpp
+++ b/src/algorithm/SORTPAIRS.cpp
@@ -28,8 +28,10 @@ SORTPAIRS::SORTPAIRS(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // not useful in this case due to O(n*log(n)) algorithm
-  setBytesWrittenPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // not useful in this case due to O(n*log(n)) algorithm
+  // not useful in this case due to O(n*log(n)) algorithm
+  setBytesReadPerRep( 0 );
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // x, i
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/apps/CONVECTION3DPA.cpp b/src/apps/CONVECTION3DPA.cpp
index 8ad78034d..b53671c22 100644
--- a/src/apps/CONVECTION3DPA.cpp
+++ b/src/apps/CONVECTION3DPA.cpp
@@ -36,9 +36,10 @@ CONVECTION3DPA::CONVECTION3DPA(const RunParams& params)
   setKernelsPerRep(1);
 
   setBytesReadPerRep( 3*sizeof(Real_type) * CPA_Q1D*CPA_D1D + // b, bt, g
-                      2*sizeof(Real_type) * CPA_D1D*CPA_D1D*CPA_D1D*m_NE + // x, y
+                      1*sizeof(Real_type) * CPA_D1D*CPA_D1D*CPA_D1D*m_NE + // x
                CPA_VDIM*sizeof(Real_type) * CPA_Q1D*CPA_Q1D*CPA_Q1D*m_NE ); // d
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * CPA_D1D*CPA_D1D*CPA_D1D*m_NE ); // y
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * CPA_D1D*CPA_D1D*CPA_D1D*m_NE ); // y
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   setFLOPsPerRep(m_NE * (
diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp
index 4dee604a3..412b99f53 100644
--- a/src/apps/DEL_DOT_VEC_2D.cpp
+++ b/src/apps/DEL_DOT_VEC_2D.cpp
@@ -37,9 +37,10 @@ DEL_DOT_VEC_2D::DEL_DOT_VEC_2D(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() +
-                      4*sizeof(Real_type) * m_domain->n_real_nodes ); // 4 variables with 2d nodal stencil pattern: 4 touches per iterate
-  setBytesWrittenPerRep( 1*sizeof(Index_type) * getItsPerRep() );
+  setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() + // real_zones
+                      4*sizeof(Real_type) * m_domain->n_real_nodes ); // x, y, fx, fy (2d nodal stencil pattern: 4 touches per iterate)
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() ); // div
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(54 * m_domain->n_real_zones);
 
diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp
index ce71abe98..55977f325 100644
--- a/src/apps/DIFFUSION3DPA.cpp
+++ b/src/apps/DIFFUSION3DPA.cpp
@@ -36,9 +36,10 @@ DIFFUSION3DPA::DIFFUSION3DPA(const RunParams& params)
   setKernelsPerRep(1);
 
   setBytesReadPerRep( 2*sizeof(Real_type) * DPA_Q1D*DPA_D1D + // b, g
-                      2*sizeof(Real_type) * DPA_D1D*DPA_D1D*DPA_D1D*m_NE + // x, y
+                      1*sizeof(Real_type) * DPA_D1D*DPA_D1D*DPA_D1D*m_NE + // x
                 DPA_SYM*sizeof(Real_type) * DPA_Q1D*DPA_Q1D*DPA_Q1D*m_NE ); // d
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * DPA_D1D*DPA_D1D*DPA_D1D*m_NE ); // y
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * DPA_D1D*DPA_D1D*DPA_D1D*m_NE ); // y
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   setFLOPsPerRep(m_NE * (DPA_Q1D * DPA_D1D +
diff --git a/src/apps/EDGE3D.cpp b/src/apps/EDGE3D.cpp
index 939d9c0f9..944a47ea4 100644
--- a/src/apps/EDGE3D.cpp
+++ b/src/apps/EDGE3D.cpp
@@ -40,8 +40,9 @@ EDGE3D::EDGE3D(const RunParams& params)
 
   // touched data size, not actual number of stores and loads
   // see VOL3D.cpp
-  setBytesReadPerRep( 3*sizeof(Real_type) * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() );
+  setBytesReadPerRep( 3*sizeof(Real_type) * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) ); // x, y, z (3d nodal stencil pattern: 8 touches per iterate)
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() ); // sum
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   constexpr size_t flops_k_loop = 15
diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp
index bc2ac2b17..dd567d245 100644
--- a/src/apps/ENERGY.cpp
+++ b/src/apps/ENERGY.cpp
@@ -29,21 +29,28 @@ ENERGY::ENERGY(const RunParams& params)
   setItsPerRep( 6 * getActualProblemSize() );
   setKernelsPerRep(6);
   // some branches are never taken due to the nature of the initialization of delvc
-  // the additional reads that would be done if those branches were taken are noted in the comments
-  setBytesReadPerRep((5*sizeof(Real_type) +
-                      1*sizeof(Real_type) + // 8
-                      6*sizeof(Real_type) +
-                      2*sizeof(Real_type) +
-                      7*sizeof(Real_type) + // 12
-                      1*sizeof(Real_type)   // 8
+  // the additional ops that would be done if those branches were taken are noted in the comments
+  setBytesReadPerRep((5*sizeof(Real_type) + // e_old, delvc, p_old, q_old, work
+                      1*sizeof(Real_type) + // delvc (+7 : compHalfStep, pbvc, e_new, bvc, pHalfStep, ql_old, qq_old)
+                      5*sizeof(Real_type) + // delvc, p_old, q_old, pHalfStep, q_new
+                      1*sizeof(Real_type) + // work
+                      6*sizeof(Real_type) + // delvc p_old, q_old, pHalfStep, q_new, p_new (+5 : pbvc, vnewc, bvc, ql_old, qq_old )
+                      1*sizeof(Real_type)   // delvc (+7 : pbvc, e_new, vnewc, bvc, p_new, ql_old, qq_old )
                       ) * getActualProblemSize() );
-  setBytesWrittenPerRep((1*sizeof(Real_type) +
-                         1*sizeof(Real_type) +
-                         1*sizeof(Real_type) +
-                         1*sizeof(Real_type) +
-                         1*sizeof(Real_type) +
-                         0*sizeof(Real_type)
+  setBytesWrittenPerRep((1*sizeof(Real_type) + // e_new
+                         1*sizeof(Real_type) + // q_new
+                         0*sizeof(Real_type) +
+                         0*sizeof(Real_type) +
+                         0*sizeof(Real_type) +
+                         0*sizeof(Real_type)   // (+1 : q_new )
                          ) * getActualProblemSize() );
+  setBytesModifyWrittenPerRep( (0*sizeof(Real_type) +
+                                0*sizeof(Real_type) +
+                                1*sizeof(Real_type) + // e_new
+                                1*sizeof(Real_type) + // e_new
+                                1*sizeof(Real_type) + // e_new
+                                0*sizeof(Real_type)
+                                ) * getActualProblemSize() );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep((6  +
                   11 + // 1 sqrt
diff --git a/src/apps/FEMSWEEP.cpp b/src/apps/FEMSWEEP.cpp
index e24344739..e2dea215c 100644
--- a/src/apps/FEMSWEEP.cpp
+++ b/src/apps/FEMSWEEP.cpp
@@ -48,13 +48,13 @@ FEMSWEEP::FEMSWEEP(const RunParams& params)
   setItsPerRep(1);
   setKernelsPerRep(1);
   // using total data size instead of writes and reads
-  setBytesReadPerRep( 1*sizeof(Real_type) * m_Blen +
-                      1*sizeof(Real_type) * m_Alen +
-                      1*sizeof(Real_type) * m_Flen +
-                      1*sizeof(Real_type) * m_Sglen +
-                      1*sizeof(Real_type) * m_M0len +
-                      1*sizeof(Real_type) * m_Xlen );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_Xlen );
+  setBytesReadPerRep( 1*sizeof(Real_type) * m_Blen + // Bdat
+                      1*sizeof(Real_type) * m_Alen + // Adat
+                      1*sizeof(Real_type) * m_Flen + // Fdat
+                      1*sizeof(Real_type) * m_Sglen + // Sgdat
+                      1*sizeof(Real_type) * m_M0len ); // M0dat
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * m_Xlen ); // Xdat
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   // This is an estimate of the upper bound FLOPs.
diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp
index a359327c0..6c507e7b0 100644
--- a/src/apps/FIR.cpp
+++ b/src/apps/FIR.cpp
@@ -30,9 +30,10 @@ FIR::FIR(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( m_coefflen*sizeof(Real_type) +
-                      1*sizeof(Real_type) * (getActualProblemSize() + m_coefflen-1) );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( m_coefflen*sizeof(Real_type) + // coeff
+                      1*sizeof(Real_type) * (getActualProblemSize() + m_coefflen-1) ); // in
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // out
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep((2 * m_coefflen) * getActualProblemSize());
 
diff --git a/src/apps/INTSC_HEXHEX.cpp b/src/apps/INTSC_HEXHEX.cpp
index 7c4d01852..dec325e44 100644
--- a/src/apps/INTSC_HEXHEX.cpp
+++ b/src/apps/INTSC_HEXHEX.cpp
@@ -65,6 +65,7 @@ INTSC_HEXHEX::INTSC_HEXHEX(const RunParams& params)
   //   A standard intersection is 8 subzone intersections.
   //
   setBytesWrittenPerRep( 13*8*sizeof(Real_type) * getItsPerRep() );
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   constexpr Size_type flops_per_tri = 336 ;
diff --git a/src/apps/INTSC_HEXRECT.cpp b/src/apps/INTSC_HEXRECT.cpp
index 35a52cc87..3a9e0eec6 100644
--- a/src/apps/INTSC_HEXRECT.cpp
+++ b/src/apps/INTSC_HEXRECT.cpp
@@ -73,6 +73,7 @@ INTSC_HEXRECT::INTSC_HEXRECT(const RunParams& params)
 
   // Bytes written : nvals_hexrect (=4) doubles for each intersection.
   setBytesWrittenPerRep( nvals_hexrect*sizeof(Real_type) * getItsPerRep() );
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   constexpr Size_type flops_per_tri = 150 ;
diff --git a/src/apps/LTIMES.cpp b/src/apps/LTIMES.cpp
index 9d8e193f9..0c6110ce8 100644
--- a/src/apps/LTIMES.cpp
+++ b/src/apps/LTIMES.cpp
@@ -42,10 +42,10 @@ LTIMES::LTIMES(const RunParams& params)
   setItsPerRep( m_philen );
   setKernelsPerRep(1);
   // using total data size instead of writes and reads
-  setBytesReadPerRep( 1*sizeof(Real_type) * m_philen +
-                      1*sizeof(Real_type) * m_elllen +
-                      1*sizeof(Real_type) * m_psilen );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_philen );
+  setBytesReadPerRep( 1*sizeof(Real_type) * m_elllen + // ell
+                      1*sizeof(Real_type) * m_psilen ); // psi
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * m_philen ); // phi
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * m_num_z * m_num_g * m_num_m * m_num_d);
 
diff --git a/src/apps/LTIMES_NOVIEW.cpp b/src/apps/LTIMES_NOVIEW.cpp
index 59f350dad..670d8ddb5 100644
--- a/src/apps/LTIMES_NOVIEW.cpp
+++ b/src/apps/LTIMES_NOVIEW.cpp
@@ -42,10 +42,10 @@ LTIMES_NOVIEW::LTIMES_NOVIEW(const RunParams& params)
   setItsPerRep( m_philen );
   setKernelsPerRep(1);
   // using total data size instead of writes and reads
-  setBytesReadPerRep( 1*sizeof(Real_type) * m_philen +
-                      1*sizeof(Real_type) * m_elllen +
-                      1*sizeof(Real_type) * m_psilen );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_philen );
+  setBytesReadPerRep( 1*sizeof(Real_type) * m_elllen + // ell
+                      1*sizeof(Real_type) * m_psilen ); // psi
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * m_philen ); // phi
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * m_num_z * m_num_g * m_num_m * m_num_d);
 
diff --git a/src/apps/MASS3DEA.cpp b/src/apps/MASS3DEA.cpp
index 0155be304..90c6fbb30 100644
--- a/src/apps/MASS3DEA.cpp
+++ b/src/apps/MASS3DEA.cpp
@@ -40,6 +40,7 @@ MASS3DEA::MASS3DEA(const RunParams& params)
   setBytesReadPerRep( 1*sizeof(Real_type) * MEA_Q1D*MEA_D1D + // B
                       1*sizeof(Real_type) * MEA_Q1D*MEA_Q1D*MEA_Q1D*m_NE ); // D
   setBytesWrittenPerRep( 1*sizeof(Real_type) * ea_mat_entries*m_NE ); // M_e
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   setFLOPsPerRep(m_NE * 7 * ea_mat_entries);
diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp
index 55eea8090..8838d7741 100644
--- a/src/apps/MASS3DPA.cpp
+++ b/src/apps/MASS3DPA.cpp
@@ -36,9 +36,10 @@ MASS3DPA::MASS3DPA(const RunParams& params)
   setKernelsPerRep(1);
 
   setBytesReadPerRep( 2*sizeof(Real_type) * MPA_Q1D*MPA_D1D + // B, Bt
-                      2*sizeof(Real_type) * MPA_D1D*MPA_D1D*MPA_D1D*m_NE + // X, Y
+                      1*sizeof(Real_type) * MPA_D1D*MPA_D1D*MPA_D1D*m_NE + // X
                       1*sizeof(Real_type) * MPA_Q1D*MPA_Q1D*MPA_Q1D*m_NE ); // D
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * MPA_D1D*MPA_D1D*MPA_D1D*m_NE ); // Y
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * MPA_D1D*MPA_D1D*MPA_D1D*m_NE ); // Y
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   setFLOPsPerRep(m_NE * (2 * MPA_D1D * MPA_D1D * MPA_D1D * MPA_Q1D +
diff --git a/src/apps/MASSVEC3DPA.cpp b/src/apps/MASSVEC3DPA.cpp
index 9759d5a96..3d41a9915 100644
--- a/src/apps/MASSVEC3DPA.cpp
+++ b/src/apps/MASSVEC3DPA.cpp
@@ -43,6 +43,7 @@ MASSVEC3DPA::MASSVEC3DPA(const RunParams &params)
                          m_NE); // D
   setBytesWrittenPerRep(3 * sizeof(Real_type) * MVPA_D1D * MVPA_D1D * MVPA_D1D *
                         MVPA_DIM * m_NE); // Y (3 components)
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep(0);
 
   //3 for the dimension loop
diff --git a/src/apps/MATVEC_3D_STENCIL.cpp b/src/apps/MATVEC_3D_STENCIL.cpp
index 8f98d6c4e..7bc5d4e83 100644
--- a/src/apps/MATVEC_3D_STENCIL.cpp
+++ b/src/apps/MATVEC_3D_STENCIL.cpp
@@ -69,10 +69,11 @@ MATVEC_3D_STENCIL::MATVEC_3D_STENCIL(const RunParams& params)
                             get_size_matrix(1, 1, 1) +
                             get_size_matrix(0, 1, 1) +
                             get_size_matrix(1, 1, 1) ;
-  setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() +
-                      1*sizeof(Real_type) * x_accessed +
-                      1*sizeof(Real_type) * m_accessed );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * b_accessed );
+  setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() + // real_zones
+                      1*sizeof(Real_type) * x_accessed + // x
+                      1*sizeof(Real_type) * m_accessed ); // m
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * b_accessed ); // b
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   const size_t multiplies = 27;
diff --git a/src/apps/NODAL_ACCUMULATION_3D.cpp b/src/apps/NODAL_ACCUMULATION_3D.cpp
index 1f09d4c22..5e77a72b8 100644
--- a/src/apps/NODAL_ACCUMULATION_3D.cpp
+++ b/src/apps/NODAL_ACCUMULATION_3D.cpp
@@ -39,10 +39,11 @@ NODAL_ACCUMULATION_3D::NODAL_ACCUMULATION_3D(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
   // touched data size, not actual number of stores and loads
-  setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() +
-                      1*sizeof(Real_type) * getItsPerRep() );
+  setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() + // real_zones
+                      1*sizeof(Real_type) * getItsPerRep() ); // vol
   setBytesWrittenPerRep( 0 );
-  setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) * m_domain->n_real_nodes );
+  setBytesModifyWrittenPerRep( 0 );
+  setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) * m_domain->n_real_nodes ); // x (3d nodal stencil pattern: 8 touches per iterate)
   setFLOPsPerRep(9 * getItsPerRep());
 
   checksum_scale_factor = 0.001 *
diff --git a/src/apps/PRESSURE.cpp b/src/apps/PRESSURE.cpp
index 6cded3f57..5d485b6ff 100644
--- a/src/apps/PRESSURE.cpp
+++ b/src/apps/PRESSURE.cpp
@@ -28,10 +28,11 @@ PRESSURE::PRESSURE(const RunParams& params)
 
   setItsPerRep( 2 * getActualProblemSize() );
   setKernelsPerRep(2);
-  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() +
-                      3*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() +
-                         1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() + // bvc
+                      3*sizeof(Real_type) * getActualProblemSize() ); // bvc, e_old, vnewc
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() + // compression
+                         1*sizeof(Real_type) * getActualProblemSize() ); // p_new
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep((2 +
                   1
diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp
index d541622bb..7f65abd58 100644
--- a/src/apps/VOL3D.cpp
+++ b/src/apps/VOL3D.cpp
@@ -38,8 +38,9 @@ VOL3D::VOL3D(const RunParams& params)
   setItsPerRep( m_domain->lpz+1 - m_domain->fpz );
   setKernelsPerRep(1);
   // touched data size, not actual number of stores and loads
-  setBytesReadPerRep( 3*sizeof(Real_type) * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() );
+  setBytesReadPerRep( 3*sizeof(Real_type) * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) ); // x, y, z (3d nodal stencil pattern: 8 touches per iterate)
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() ); // vol
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(72 * (m_domain->lpz+1 - m_domain->fpz));
 
diff --git a/src/apps/ZONAL_ACCUMULATION_3D.cpp b/src/apps/ZONAL_ACCUMULATION_3D.cpp
index 2f6f75975..8a87731cd 100644
--- a/src/apps/ZONAL_ACCUMULATION_3D.cpp
+++ b/src/apps/ZONAL_ACCUMULATION_3D.cpp
@@ -39,9 +39,10 @@ ZONAL_ACCUMULATION_3D::ZONAL_ACCUMULATION_3D(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
   // touched data size, not actual number of stores and loads
-  setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() +
-                      1*sizeof(Real_type) * m_domain->n_real_nodes );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() );
+  setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() + // real_zones
+                      1*sizeof(Real_type) * m_domain->n_real_nodes ); // x (3d nodal stencil pattern: 8 touches per iterate)
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() ); // vol
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(8 * getItsPerRep());
 
diff --git a/src/basic/ARRAY_OF_PTRS.cpp b/src/basic/ARRAY_OF_PTRS.cpp
index 0db7901ac..ac64245d8 100644
--- a/src/basic/ARRAY_OF_PTRS.cpp
+++ b/src/basic/ARRAY_OF_PTRS.cpp
@@ -30,8 +30,9 @@ ARRAY_OF_PTRS::ARRAY_OF_PTRS(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( m_array_size*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( m_array_size*sizeof(Real_type) * getActualProblemSize() ); // x
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // y
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(m_array_size * getActualProblemSize());
 
diff --git a/src/basic/COPY8.cpp b/src/basic/COPY8.cpp
index 5d27573d2..c75794667 100644
--- a/src/basic/COPY8.cpp
+++ b/src/basic/COPY8.cpp
@@ -28,8 +28,9 @@ COPY8::COPY8(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 8*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 8*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 8*sizeof(Real_type) * getActualProblemSize() ); // x0, x1, x2, x3, x4, x5, x6, x7
+  setBytesWrittenPerRep( 8*sizeof(Real_type) * getActualProblemSize() ); // y0, y1, y2, y3, y4, y5, y6, y7
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp
index 0a596a005..277ab567e 100644
--- a/src/basic/DAXPY.cpp
+++ b/src/basic/DAXPY.cpp
@@ -28,8 +28,9 @@ DAXPY::DAXPY(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // x
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // y
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * getActualProblemSize());
 
diff --git a/src/basic/DAXPY_ATOMIC.cpp b/src/basic/DAXPY_ATOMIC.cpp
index 97129f832..d9e3dfed6 100644
--- a/src/basic/DAXPY_ATOMIC.cpp
+++ b/src/basic/DAXPY_ATOMIC.cpp
@@ -28,9 +28,10 @@ DAXPY_ATOMIC::DAXPY_ATOMIC(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // x
   setBytesWrittenPerRep( 0 );
-  setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesModifyWrittenPerRep( 0 );
+  setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // y
   setFLOPsPerRep(2 * getActualProblemSize());
 
   setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning);
diff --git a/src/basic/EMPTY.cpp b/src/basic/EMPTY.cpp
index 8787d821c..edb8b3497 100644
--- a/src/basic/EMPTY.cpp
+++ b/src/basic/EMPTY.cpp
@@ -30,6 +30,7 @@ EMPTY::EMPTY(const RunParams& params)
   setKernelsPerRep( 1 );
   setBytesReadPerRep( 0 );
   setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep( 0 );
 
diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp
index 21c9e2710..feab1d994 100644
--- a/src/basic/IF_QUAD.cpp
+++ b/src/basic/IF_QUAD.cpp
@@ -28,8 +28,9 @@ IF_QUAD::IF_QUAD(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 3*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 2*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 3*sizeof(Real_type) * getActualProblemSize() ); // b, a, c
+  setBytesWrittenPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // x2, x1
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   // estimate conditional true half of the time, 1 sqrt
   setFLOPsPerRep(4 * getActualProblemSize() +
diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp
index ed923d13f..354e274b2 100644
--- a/src/basic/INDEXLIST.cpp
+++ b/src/basic/INDEXLIST.cpp
@@ -28,10 +28,9 @@ INDEXLIST::INDEXLIST(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Index_type) +
-                      1*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Index_type) +
-                         1*sizeof(Int_type) * getActualProblemSize() / 2 ); // about 50% output
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // x
+  setBytesWrittenPerRep( 1*sizeof(Int_type) * getActualProblemSize() / 2 ); // list (about 50% output)
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/basic/INDEXLIST_3LOOP.cpp b/src/basic/INDEXLIST_3LOOP.cpp
index 3d9a5a6f4..8fb19321e 100644
--- a/src/basic/INDEXLIST_3LOOP.cpp
+++ b/src/basic/INDEXLIST_3LOOP.cpp
@@ -28,18 +28,15 @@ INDEXLIST_3LOOP::INDEXLIST_3LOOP(const RunParams& params)
 
   setItsPerRep( 3 * getActualProblemSize() + 1 );
   setKernelsPerRep(3);
-  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() +
-
-                      1*sizeof(Index_type) +
-                      1*sizeof(Index_type) * (getActualProblemSize()+1) +
-
-                      1*sizeof(Index_type) * (getActualProblemSize()+1) );
-  setBytesWrittenPerRep( 1*sizeof(Index_type) * getActualProblemSize() +
-
-                         1*sizeof(Index_type) +
-                         1*sizeof(Index_type) * (getActualProblemSize()+1) +
-
-                         1*sizeof(Int_type) * (getActualProblemSize()+1) / 2 ); // about 50% output
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() + // x
+                      0 +
+                      1*sizeof(Index_type) * (getActualProblemSize()+1) ); // counts
+  setBytesWrittenPerRep( 1*sizeof(Index_type) * getActualProblemSize() + // counts
+                         0 +
+                         1*sizeof(Int_type) * (getActualProblemSize()+1) / 2 ); // list (about 50% output)
+  setBytesModifyWrittenPerRep( 0 +
+                               1*sizeof(Index_type) * (getActualProblemSize()+1) + // counts
+                               0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp
index da3a02663..e60c70f2c 100644
--- a/src/basic/INIT3.cpp
+++ b/src/basic/INIT3.cpp
@@ -28,8 +28,9 @@ INIT3::INIT3(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 3*sizeof(Real_type) * getActualProblemSize()  );
+  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // in1, in2
+  setBytesWrittenPerRep( 3*sizeof(Real_type) * getActualProblemSize()  ); // out3, out2, out1
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * getActualProblemSize());
 
diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp
index 9fef02ced..ebf2d1964 100644
--- a/src/basic/INIT_VIEW1D.cpp
+++ b/src/basic/INIT_VIEW1D.cpp
@@ -29,7 +29,8 @@ INIT_VIEW1D::INIT_VIEW1D(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
   setBytesReadPerRep( 0 );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize()  );
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // a
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * getActualProblemSize());
 
diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp
index 1e594cbc6..75e6452db 100644
--- a/src/basic/INIT_VIEW1D_OFFSET.cpp
+++ b/src/basic/INIT_VIEW1D_OFFSET.cpp
@@ -29,7 +29,8 @@ INIT_VIEW1D_OFFSET::INIT_VIEW1D_OFFSET(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
   setBytesReadPerRep( 0 );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize()  );
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // a
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * getActualProblemSize());
 
diff --git a/src/basic/MAT_MAT_SHARED.cpp b/src/basic/MAT_MAT_SHARED.cpp
index 4844504c7..179eb2715 100644
--- a/src/basic/MAT_MAT_SHARED.cpp
+++ b/src/basic/MAT_MAT_SHARED.cpp
@@ -32,8 +32,9 @@ MAT_MAT_SHARED::MAT_MAT_SHARED(const RunParams &params)
   setItsPerRep( num_tiles*num_tiles * TL_SZ*TL_SZ );
   setKernelsPerRep(1);
 
-  setBytesReadPerRep( 2*sizeof(Real_type) * m_N*m_N );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_N*m_N  );
+  setBytesReadPerRep( 2*sizeof(Real_type) * m_N*m_N ); // A, B
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_N*m_N  ); // C
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   setFLOPsPerRep(2 * TL_SZ * TL_SZ * TL_SZ * num_tiles * num_tiles * num_tiles);
diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp
index c2afe48ca..4c6d92ef2 100644
--- a/src/basic/MULADDSUB.cpp
+++ b/src/basic/MULADDSUB.cpp
@@ -28,8 +28,9 @@ MULADDSUB::MULADDSUB(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 3*sizeof(Real_type) * getActualProblemSize()  );
+  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // in1, in2
+  setBytesWrittenPerRep( 3*sizeof(Real_type) * getActualProblemSize()  ); // out1, out2, out3
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(3 * getActualProblemSize());
 
diff --git a/src/basic/MULTI_REDUCE.cpp b/src/basic/MULTI_REDUCE.cpp
index 9e7daf4c9..2f8680648 100644
--- a/src/basic/MULTI_REDUCE.cpp
+++ b/src/basic/MULTI_REDUCE.cpp
@@ -34,11 +34,11 @@ MULTI_REDUCE::MULTI_REDUCE(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Data_type) * m_num_bins +
-                      1*sizeof(Data_type) * getActualProblemSize() +
-                      1*sizeof(Index_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Data_type) * m_num_bins );
-  setBytesAtomicModifyWrittenPerRep( 0 );
+  setBytesReadPerRep( 1*sizeof(Data_type) * getActualProblemSize() + // data
+                      1*sizeof(Index_type) * getActualProblemSize() ); // bins
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 0 );
+  setBytesAtomicModifyWrittenPerRep( 1*sizeof(Data_type) * m_num_bins ); // values
   setFLOPsPerRep(1 * getActualProblemSize());
 
   setChecksumConsistency(ChecksumConsistency::Inconsistent);
diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp
index 29f6db466..486be7ee2 100644
--- a/src/basic/NESTED_INIT.cpp
+++ b/src/basic/NESTED_INIT.cpp
@@ -40,7 +40,8 @@ NESTED_INIT::NESTED_INIT(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
   setBytesReadPerRep( 0 );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize()  );
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // array
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(3 * getActualProblemSize());
 
diff --git a/src/basic/PI_ATOMIC.cpp b/src/basic/PI_ATOMIC.cpp
index b1be49296..852099179 100644
--- a/src/basic/PI_ATOMIC.cpp
+++ b/src/basic/PI_ATOMIC.cpp
@@ -30,7 +30,8 @@ PI_ATOMIC::PI_ATOMIC(const RunParams& params)
   setKernelsPerRep(1);
   setBytesReadPerRep( 0 );
   setBytesWrittenPerRep( 0  );
-  setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) );
+  setBytesModifyWrittenPerRep( 0 );
+  setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) ); // pi
   setFLOPsPerRep(6 * getActualProblemSize() + 1);
 
   setChecksumConsistency(ChecksumConsistency::Inconsistent);
diff --git a/src/basic/PI_REDUCE.cpp b/src/basic/PI_REDUCE.cpp
index b6507ee6a..56f35ae5f 100644
--- a/src/basic/PI_REDUCE.cpp
+++ b/src/basic/PI_REDUCE.cpp
@@ -28,8 +28,9 @@ PI_REDUCE::PI_REDUCE(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) );
+  setBytesReadPerRep( 0 );
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(6 * getActualProblemSize() + 1);
 
diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp
index e3bfda3d2..cd7a2926b 100644
--- a/src/basic/REDUCE3_INT.cpp
+++ b/src/basic/REDUCE3_INT.cpp
@@ -33,9 +33,9 @@ REDUCE3_INT::REDUCE3_INT(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 3*sizeof(Int_type) +
-                      1*sizeof(Int_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 3*sizeof(Int_type) );
+  setBytesReadPerRep( 1*sizeof(Int_type) * getActualProblemSize() ); // vec
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp
index 2217a0b91..949a351ef 100644
--- a/src/basic/REDUCE_STRUCT.cpp
+++ b/src/basic/REDUCE_STRUCT.cpp
@@ -33,9 +33,9 @@ REDUCE_STRUCT::REDUCE_STRUCT(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 6*sizeof(Real_type) +
-                      2*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 6*sizeof(Real_type) );
+  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // x, y
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * getActualProblemSize() + 2);
 
diff --git a/src/basic/TRAP_INT.cpp b/src/basic/TRAP_INT.cpp
index d921acb7e..b708d381f 100644
--- a/src/basic/TRAP_INT.cpp
+++ b/src/basic/TRAP_INT.cpp
@@ -28,8 +28,9 @@ TRAP_INT::TRAP_INT(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) );
+  setBytesReadPerRep( 0 );
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(10 * getActualProblemSize()); // 1 sqrt
 
diff --git a/src/comm/HALO_EXCHANGE.cpp b/src/comm/HALO_EXCHANGE.cpp
index b9e410bc3..6a91ce2ab 100644
--- a/src/comm/HALO_EXCHANGE.cpp
+++ b/src/comm/HALO_EXCHANGE.cpp
@@ -32,18 +32,19 @@ HALO_EXCHANGE::HALO_EXCHANGE(const RunParams& params)
 
   setItsPerRep( 2 * m_num_vars * halo_size );
   setKernelsPerRep( 2 * s_num_neighbors * m_num_vars );
-  setBytesReadPerRep( 1*sizeof(Int_type) * m_num_vars * halo_size +   // pack
-                      1*sizeof(Real_type) * m_num_vars * halo_size +  // pack
+  setBytesReadPerRep( 1*sizeof(Int_type) * m_num_vars * halo_size +   // pack_index_lists
+                      1*sizeof(Real_type) * m_num_vars * halo_size +  // vars
 
-                      1*sizeof(Real_type) * m_num_vars * halo_size +  // send
+                      1*sizeof(Real_type) * m_num_vars * halo_size +  // (pack|send)_buffers (MPI)
 
-                      1*sizeof(Int_type) * m_num_vars * halo_size +   // unpack
-                      1*sizeof(Real_type) * m_num_vars * halo_size ); // unpack
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_num_vars * halo_size +  // pack
+                      1*sizeof(Int_type) * m_num_vars * halo_size +   // unpack_index_lists
+                      1*sizeof(Real_type) * m_num_vars * halo_size ); // unpack_buffers
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_num_vars * halo_size +  // pack_buffers
 
-                         1*sizeof(Real_type) * m_num_vars * halo_size +  // recv
+                         1*sizeof(Real_type) * m_num_vars * halo_size +  // (recv|unpack)_buffers (MPI)
 
-                         1*sizeof(Real_type) * m_num_vars * halo_size ); // unpack
+                         1*sizeof(Real_type) * m_num_vars * halo_size ); // vars
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/comm/HALO_EXCHANGE_FUSED.cpp b/src/comm/HALO_EXCHANGE_FUSED.cpp
index 907fcfd5c..227a10bda 100644
--- a/src/comm/HALO_EXCHANGE_FUSED.cpp
+++ b/src/comm/HALO_EXCHANGE_FUSED.cpp
@@ -32,18 +32,19 @@ HALO_EXCHANGE_FUSED::HALO_EXCHANGE_FUSED(const RunParams& params)
 
   setItsPerRep( 2 * m_num_vars * halo_size );
   setKernelsPerRep( 2 );
-  setBytesReadPerRep( 1*sizeof(Int_type) * m_num_vars * halo_size +   // pack
-                      1*sizeof(Real_type) * m_num_vars * halo_size +  // pack
+  setBytesReadPerRep( 1*sizeof(Int_type) * m_num_vars * halo_size +   // pack_index_lists
+                      1*sizeof(Real_type) * m_num_vars * halo_size +  // vars
 
-                      1*sizeof(Real_type) * m_num_vars * halo_size +  // send
+                      1*sizeof(Real_type) * m_num_vars * halo_size +  // (pack|send)_buffers (MPI)
 
-                      1*sizeof(Int_type) * m_num_vars * halo_size +   // unpack
-                      1*sizeof(Real_type) * m_num_vars * halo_size ); // unpack
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_num_vars * halo_size +  // pack
+                      1*sizeof(Int_type) * m_num_vars * halo_size +   // unpack_index_lists
+                      1*sizeof(Real_type) * m_num_vars * halo_size ); // unpack_buffers
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_num_vars * halo_size +  // pack_buffers
 
-                         1*sizeof(Real_type) * m_num_vars * halo_size +  // recv
+                         1*sizeof(Real_type) * m_num_vars * halo_size +  // (recv|unpack)_buffers (MPI)
 
-                         1*sizeof(Real_type) * m_num_vars * halo_size ); // unpack
+                         1*sizeof(Real_type) * m_num_vars * halo_size ); // vars
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/comm/HALO_PACKING.cpp b/src/comm/HALO_PACKING.cpp
index ca7d7ee9f..6ed165613 100644
--- a/src/comm/HALO_PACKING.cpp
+++ b/src/comm/HALO_PACKING.cpp
@@ -26,14 +26,15 @@ HALO_PACKING::HALO_PACKING(const RunParams& params)
 
   setItsPerRep( 2 * m_num_vars * halo_size );
   setKernelsPerRep( 2 * s_num_neighbors * m_num_vars );
-  setBytesReadPerRep( 1*sizeof(Int_type) * m_num_vars * halo_size +   // pack
-                      1*sizeof(Real_type) * m_num_vars * halo_size +  // pack
+  setBytesReadPerRep( 1*sizeof(Int_type) * m_num_vars * halo_size +   // pack_index_lists
+                      1*sizeof(Real_type) * m_num_vars * halo_size +  // vars
 
-                      1*sizeof(Int_type) * m_num_vars * halo_size +   // unpack
-                      1*sizeof(Real_type) * m_num_vars * halo_size ); // unpack
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_num_vars * halo_size +  // pack
+                      1*sizeof(Int_type) * m_num_vars * halo_size +   // unpack_index_lists
+                      1*sizeof(Real_type) * m_num_vars * halo_size ); // unpack_buffers
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_num_vars * halo_size +  // pack_buffers
 
-                         1*sizeof(Real_type) * m_num_vars * halo_size ); // unpack
+                         1*sizeof(Real_type) * m_num_vars * halo_size ); // vars
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/comm/HALO_PACKING_FUSED.cpp b/src/comm/HALO_PACKING_FUSED.cpp
index 978031419..f503c0951 100644
--- a/src/comm/HALO_PACKING_FUSED.cpp
+++ b/src/comm/HALO_PACKING_FUSED.cpp
@@ -26,14 +26,15 @@ HALO_PACKING_FUSED::HALO_PACKING_FUSED(const RunParams& params)
 
   setItsPerRep( 2 * m_num_vars * halo_size );
   setKernelsPerRep( 2 );
-  setBytesReadPerRep( 1*sizeof(Int_type) * m_num_vars * halo_size +   // pack
-                      1*sizeof(Real_type) * m_num_vars * halo_size +  // pack
+  setBytesReadPerRep( 1*sizeof(Int_type) * m_num_vars * halo_size +   // pack_index_lists
+                      1*sizeof(Real_type) * m_num_vars * halo_size +  // vars
 
-                      1*sizeof(Int_type) * m_num_vars * halo_size +   // unpack
-                      1*sizeof(Real_type) * m_num_vars * halo_size ); // unpack
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_num_vars * halo_size +  // pack
+                      1*sizeof(Int_type) * m_num_vars * halo_size +   // unpack_index_lists
+                      1*sizeof(Real_type) * m_num_vars * halo_size ); // unpack_buffers
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_num_vars * halo_size +  // pack_buffers
 
-                         1*sizeof(Real_type) * m_num_vars * halo_size ); // unpack
+                         1*sizeof(Real_type) * m_num_vars * halo_size ); // vars
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/comm/HALO_SENDRECV.cpp b/src/comm/HALO_SENDRECV.cpp
index 9be78ecf3..0c7cfc574 100644
--- a/src/comm/HALO_SENDRECV.cpp
+++ b/src/comm/HALO_SENDRECV.cpp
@@ -32,8 +32,9 @@ HALO_SENDRECV::HALO_SENDRECV(const RunParams& params)
 
   setItsPerRep( 0 );
   setKernelsPerRep( 0 );
-  setBytesReadPerRep( 1*sizeof(Real_type) * m_num_vars * halo_size ); // send
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_num_vars * halo_size ); // recv
+  setBytesReadPerRep( 1*sizeof(Real_type) * m_num_vars * halo_size ); // send_buffers (MPI)
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_num_vars * halo_size ); // recv_buffers (MPI)
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp
index e90ebd149..b463fc840 100644
--- a/src/common/Executor.cpp
+++ b/src/common/Executor.cpp
@@ -529,8 +529,10 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const
   Index_type itsrep_width = 0;
   Index_type bytesrep_width = 0;
   Index_type flopsrep_width = 0;
+  Index_type bytesTouchedrep_width = 0;
   Index_type bytesReadrep_width = 0;
   Index_type bytesWrittenrep_width = 0;
+  Index_type bytesModifyWrittenrep_width = 0;
   Index_type bytesAtomicModifyWrittenrep_width = 0;
   Index_type dash_width = 0;
 
@@ -541,8 +543,10 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const
     itsrep_width = max(itsrep_width, kernels[ik]->getItsPerRep());
     bytesrep_width = max(bytesrep_width, kernels[ik]->getBytesPerRep());
     flopsrep_width = max(flopsrep_width, kernels[ik]->getFLOPsPerRep());
+    bytesTouchedrep_width = max(bytesTouchedrep_width, kernels[ik]->getBytesTouchedPerRep());
     bytesReadrep_width = max(bytesReadrep_width, kernels[ik]->getBytesReadPerRep());
     bytesWrittenrep_width = max(bytesWrittenrep_width, kernels[ik]->getBytesWrittenPerRep());
+    bytesModifyWrittenrep_width = max(bytesModifyWrittenrep_width, kernels[ik]->getBytesModifyWrittenPerRep());
     bytesAtomicModifyWrittenrep_width = max(bytesAtomicModifyWrittenrep_width, kernels[ik]->getBytesAtomicModifyWrittenPerRep());
   }
 
@@ -587,6 +591,12 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const
                          static_cast<Index_type>(frsize) ) + 3;
   dash_width += flopsrep_width + static_cast<Index_type>(sepchr.size());
 
+  double btrsize = log10( static_cast<double>(bytesTouchedrep_width) );
+  string bytesTouchedrep_head("BytesTouched/rep");
+  bytesTouchedrep_width = max( static_cast<Index_type>(bytesTouchedrep_head.size()),
+                        static_cast<Index_type>(btrsize) ) + 3;
+  dash_width += bytesTouchedrep_width + static_cast<Index_type>(sepchr.size());
+
   double brrsize = log10( static_cast<double>(bytesReadrep_width) );
   string bytesReadrep_head("BytesRead/rep");
   bytesReadrep_width = max( static_cast<Index_type>(bytesReadrep_head.size()),
@@ -599,6 +609,12 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const
                         static_cast<Index_type>(bwrsize) ) + 3;
   dash_width += bytesWrittenrep_width + static_cast<Index_type>(sepchr.size());
 
+  double bmwrsize = log10( static_cast<double>(bytesModifyWrittenrep_width) );
+  string bytesModifyWrittenrep_head("BytesModifyWritten/rep");
+  bytesModifyWrittenrep_width = max( static_cast<Index_type>(bytesModifyWrittenrep_head.size()),
+                        static_cast<Index_type>(bmwrsize) ) + 3;
+  dash_width += bytesModifyWrittenrep_width + static_cast<Index_type>(sepchr.size());
+
   double bamrrsize = log10( static_cast<double>(bytesAtomicModifyWrittenrep_width) );
   string bytesAtomicModifyWrittenrep_head("BytesAtomicModifyWritten/rep");
   bytesAtomicModifyWrittenrep_width = max( static_cast<Index_type>(bytesAtomicModifyWrittenrep_head.size()),
@@ -612,8 +628,10 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const
       << sepchr <<right<< setw(kernsrep_width) << kernsrep_head
       << sepchr <<right<< setw(bytesrep_width) << bytesrep_head
       << sepchr <<right<< setw(flopsrep_width) << flopsrep_head
+      << sepchr <<right<< setw(bytesTouchedrep_width) << bytesTouchedrep_head
       << sepchr <<right<< setw(bytesReadrep_width) << bytesReadrep_head
       << sepchr <<right<< setw(bytesWrittenrep_width) << bytesWrittenrep_head
+      << sepchr <<right<< setw(bytesModifyWrittenrep_width) << bytesModifyWrittenrep_head
       << sepchr <<right<< setw(bytesAtomicModifyWrittenrep_width) << bytesAtomicModifyWrittenrep_head
       << endl;
 
@@ -633,8 +651,10 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const
         << sepchr <<right<< setw(kernsrep_width) << kern->getKernelsPerRep()
         << sepchr <<right<< setw(bytesrep_width) << kern->getBytesPerRep()
         << sepchr <<right<< setw(flopsrep_width) << kern->getFLOPsPerRep()
+        << sepchr <<right<< setw(bytesTouchedrep_width) << kern->getBytesTouchedPerRep()
         << sepchr <<right<< setw(bytesReadrep_width) << kern->getBytesReadPerRep()
         << sepchr <<right<< setw(bytesWrittenrep_width) << kern->getBytesWrittenPerRep()
+        << sepchr <<right<< setw(bytesModifyWrittenrep_width) << kern->getBytesModifyWrittenPerRep()
         << sepchr <<right<< setw(bytesAtomicModifyWrittenrep_width) << kern->getBytesAtomicModifyWrittenPerRep()
         << endl;
   }
diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp
index 9cefeaa66..0e1cb1073 100644
--- a/src/common/KernelBase.cpp
+++ b/src/common/KernelBase.cpp
@@ -56,6 +56,7 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params)
   kernels_per_rep = -1;
   bytes_read_per_rep = -1;
   bytes_written_per_rep = -1;
+  bytes_modify_written_per_rep = -1;
   bytes_atomic_modify_written_per_rep = -1;
   FLOPs_per_rep = -1;
 
@@ -87,6 +88,10 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params)
                                          CALI_ATTR_ASVALUE |
                                          CALI_ATTR_AGGREGATABLE |
                                          CALI_ATTR_SKIP_EVENTS);
+  Bytes_Touched_Rep_attr = cali_create_attribute("BytesTouched/Rep", CALI_TYPE_INT,
+                                                 CALI_ATTR_ASVALUE |
+                                                 CALI_ATTR_AGGREGATABLE |
+                                                 CALI_ATTR_SKIP_EVENTS);
   Bytes_Read_Rep_attr = cali_create_attribute("BytesRead/Rep", CALI_TYPE_INT,
                                               CALI_ATTR_ASVALUE |
                                               CALI_ATTR_AGGREGATABLE |
@@ -95,6 +100,10 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params)
                                                  CALI_ATTR_ASVALUE |
                                                  CALI_ATTR_AGGREGATABLE |
                                                  CALI_ATTR_SKIP_EVENTS);
+  Bytes_ModifyWritten_Rep_attr = cali_create_attribute("BytesModifyWritten/Rep", CALI_TYPE_INT,
+                                                       CALI_ATTR_ASVALUE |
+                                                       CALI_ATTR_AGGREGATABLE |
+                                                       CALI_ATTR_SKIP_EVENTS);
   Bytes_AtomicModifyWritten_Rep_attr = cali_create_attribute("BytesAtomicModifyWritten/Rep", CALI_TYPE_INT,
                                                              CALI_ATTR_ASVALUE |
                                                              CALI_ATTR_AGGREGATABLE |
@@ -384,6 +393,7 @@ void KernelBase::print(std::ostream& os) const
   os << "\t\t\t kernels_per_rep = " << kernels_per_rep << std::endl;
   os << "\t\t\t bytes_read_per_rep = " << bytes_read_per_rep << std::endl;
   os << "\t\t\t bytes_written_per_rep = " << bytes_written_per_rep << std::endl;
+  os << "\t\t\t bytes_modify_written_per_rep = " << bytes_modify_written_per_rep << std::endl;
   os << "\t\t\t bytes_atomic_modify_written_per_rep = " << bytes_atomic_modify_written_per_rep << std::endl;
   os << "\t\t\t FLOPs_per_rep = " << FLOPs_per_rep << std::endl;
   os << "\t\t\t num_exec: " << std::endl;
@@ -443,8 +453,10 @@ void KernelBase::doOnceCaliMetaBegin(VariantID vid, size_t tune_idx)
     cali_set_helper(Iters_Rep_attr, getItsPerRep());
     cali_set_helper(Kernels_Rep_attr, getKernelsPerRep());
     cali_set_helper(Bytes_Rep_attr, getBytesPerRep());
+    cali_set_helper(Bytes_Touched_Rep_attr, getBytesTouchedPerRep());
     cali_set_helper(Bytes_Read_Rep_attr, getBytesReadPerRep());
     cali_set_helper(Bytes_Written_Rep_attr, getBytesWrittenPerRep());
+    cali_set_helper(Bytes_ModifyWritten_Rep_attr, getBytesModifyWrittenPerRep());
     cali_set_helper(Bytes_AtomicModifyWritten_Rep_attr, getBytesAtomicModifyWrittenPerRep());
     cali_set_helper(Flops_Rep_attr, getFLOPsPerRep());
     cali_set_helper(BlockSize_attr, getBlockSize());
@@ -495,8 +507,10 @@ void KernelBase::setCaliperMgrVariantTuning(VariantID vid,
           { "expr": "any(max#Iterations/Rep)", "as": "Iterations/Rep" },
           { "expr": "any(max#Kernels/Rep)", "as": "Kernels/Rep" },
           { "expr": "any(max#Bytes/Rep)", "as": "Bytes/Rep" },
+          { "expr": "any(max#BytesTouched/Rep)", "as": "BytesTouched/Rep" },
           { "expr": "any(max#BytesRead/Rep)", "as": "BytesRead/Rep" },
           { "expr": "any(max#BytesWritten/Rep)", "as": "BytesWritten/Rep" },
+          { "expr": "any(max#BytesModifyWritten/Rep)", "as": "BytesModifyWritten/Rep" },
           { "expr": "any(max#BytesAtomicModifyWritten/Rep)", "as": "BytesAtomicModifyWritten/Rep" },
           { "expr": "any(max#Flops/Rep)", "as": "Flops/Rep" },
           { "expr": "any(max#BlockSize)", "as": "BlockSize" },
@@ -522,8 +536,10 @@ void KernelBase::setCaliperMgrVariantTuning(VariantID vid,
           { "expr": "any(any#max#Iterations/Rep)", "as": "Iterations/Rep" },
           { "expr": "any(any#max#Kernels/Rep)", "as": "Kernels/Rep" },
           { "expr": "any(any#max#Bytes/Rep)", "as": "Bytes/Rep" },
+          { "expr": "any(any#max#BytesTouched/Rep)", "as": "BytesTouched/Rep" },
           { "expr": "any(any#max#BytesRead/Rep)", "as": "BytesRead/Rep" },
           { "expr": "any(any#max#BytesWritten/Rep)", "as": "BytesWritten/Rep" },
+          { "expr": "any(any#max#BytesModifyWritten/Rep)", "as": "BytesModifyWritten/Rep" },
           { "expr": "any(any#max#BytesAtomicModifyWritten/Rep)", "as": "BytesAtomicModifyWritten/Rep" },
           { "expr": "any(any#max#Flops/Rep)", "as": "Flops/Rep" },
           { "expr": "any(any#max#BlockSize)", "as": "BlockSize" },
diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp
index 1fa6f695b..7cd4df60c 100644
--- a/src/common/KernelBase.hpp
+++ b/src/common/KernelBase.hpp
@@ -118,6 +118,7 @@ class KernelBase
   void setKernelsPerRep(Index_type nkerns) { kernels_per_rep = nkerns; };
   void setBytesReadPerRep(Index_type bytes) { bytes_read_per_rep = bytes;}
   void setBytesWrittenPerRep(Index_type bytes) { bytes_written_per_rep = bytes;}
+  void setBytesModifyWrittenPerRep(Index_type bytes) { bytes_modify_written_per_rep = bytes;}
   void setBytesAtomicModifyWrittenPerRep(Index_type bytes) { bytes_atomic_modify_written_per_rep = bytes;}
   void setFLOPsPerRep(Index_type FLOPs) { FLOPs_per_rep = FLOPs; }
   void setBlockSize(Index_type size) { kernel_block_size = size; }
@@ -200,9 +201,11 @@ class KernelBase
   Index_type getDefaultReps() const { return default_reps; }
   Index_type getItsPerRep() const { return its_per_rep; };
   Index_type getKernelsPerRep() const { return kernels_per_rep; };
-  Index_type getBytesPerRep() const { return bytes_read_per_rep + bytes_written_per_rep + 2*bytes_atomic_modify_written_per_rep; } // count atomic_modify_write operations as a read and a write to match previous counting
-  Index_type getBytesReadPerRep() const { return bytes_read_per_rep; }
-  Index_type getBytesWrittenPerRep() const { return bytes_written_per_rep; }
+  Index_type getBytesPerRep() const { return bytes_read_per_rep + bytes_written_per_rep + 2*bytes_modify_written_per_rep + 2*bytes_atomic_modify_written_per_rep; } // count modify_write operations twice to get the memory traffic
+  Index_type getBytesTouchedPerRep() const { return bytes_read_per_rep + bytes_written_per_rep + bytes_modify_written_per_rep + bytes_atomic_modify_written_per_rep; } // count modify_write operations once to get the data size only
+  Index_type getBytesReadPerRep() const { return bytes_read_per_rep + bytes_modify_written_per_rep; }
+  Index_type getBytesWrittenPerRep() const { return bytes_written_per_rep + bytes_modify_written_per_rep; }
+  Index_type getBytesModifyWrittenPerRep() const { return bytes_modify_written_per_rep; }
   Index_type getBytesAtomicModifyWrittenPerRep() const { return bytes_atomic_modify_written_per_rep; }
   Index_type getFLOPsPerRep() const { return FLOPs_per_rep; }
   double getBlockSize() const { return kernel_block_size; }
@@ -672,6 +675,7 @@ class KernelBase
   Index_type kernels_per_rep;
   Index_type bytes_read_per_rep;
   Index_type bytes_written_per_rep;
+  Index_type bytes_modify_written_per_rep;
   Index_type bytes_atomic_modify_written_per_rep;
   Index_type FLOPs_per_rep;
   double kernel_block_size = nan(""); // Set default value for non GPU kernels
@@ -691,8 +695,10 @@ class KernelBase
   cali_id_t Iters_Rep_attr;
   cali_id_t Kernels_Rep_attr;
   cali_id_t Bytes_Rep_attr;
+  cali_id_t Bytes_Touched_Rep_attr;
   cali_id_t Bytes_Read_Rep_attr;
   cali_id_t Bytes_Written_Rep_attr;
+  cali_id_t Bytes_ModifyWritten_Rep_attr;
   cali_id_t Bytes_AtomicModifyWritten_Rep_attr;
   cali_id_t Flops_Rep_attr;
   cali_id_t BlockSize_attr;
diff --git a/src/lcals/DIFF_PREDICT.cpp b/src/lcals/DIFF_PREDICT.cpp
index 205fb620e..f858cc988 100644
--- a/src/lcals/DIFF_PREDICT.cpp
+++ b/src/lcals/DIFF_PREDICT.cpp
@@ -28,8 +28,9 @@ DIFF_PREDICT::DIFF_PREDICT(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
 
   setKernelsPerRep(1);
-  setBytesReadPerRep( 10*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 10*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // cx(4)
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // px(13)
+  setBytesModifyWrittenPerRep( 9*sizeof(Real_type) * getActualProblemSize() ); // px(4), px(5), px(6), px(7), px(8), px(9), px(10), px(11), px(12)
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(9 * getActualProblemSize());
 
diff --git a/src/lcals/EOS.cpp b/src/lcals/EOS.cpp
index e3e5a1a8a..5ac4acc05 100644
--- a/src/lcals/EOS.cpp
+++ b/src/lcals/EOS.cpp
@@ -31,9 +31,10 @@ EOS::EOS(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() +
-                      1*sizeof(Real_type) * m_array_length );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() + // z, y
+                      1*sizeof(Real_type) * m_array_length ); // u (each iterate accesses the range [i, i+6])
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // x
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(16 * getActualProblemSize());
 
diff --git a/src/lcals/FIRST_DIFF.cpp b/src/lcals/FIRST_DIFF.cpp
index f16cf2f7f..d9f303bb2 100644
--- a/src/lcals/FIRST_DIFF.cpp
+++ b/src/lcals/FIRST_DIFF.cpp
@@ -30,8 +30,9 @@ FIRST_DIFF::FIRST_DIFF(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) * m_N );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * m_N ); // y (each iterate accesses the range [i, i+1])
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // x
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * getActualProblemSize());
 
diff --git a/src/lcals/FIRST_MIN.cpp b/src/lcals/FIRST_MIN.cpp
index d7cb8782e..8a2c0ba09 100644
--- a/src/lcals/FIRST_MIN.cpp
+++ b/src/lcals/FIRST_MIN.cpp
@@ -33,11 +33,9 @@ FIRST_MIN::FIRST_MIN(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Index_type) +
-                      1*sizeof(Real_type ) +
-                      1*sizeof(Real_type ) * m_N );
-  setBytesWrittenPerRep( 1*sizeof(Index_type) +
-                         1*sizeof(Real_type ) );
+  setBytesReadPerRep( 1*sizeof(Real_type ) * m_N ); // x
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/lcals/FIRST_SUM.cpp b/src/lcals/FIRST_SUM.cpp
index a31e98865..a4d223d95 100644
--- a/src/lcals/FIRST_SUM.cpp
+++ b/src/lcals/FIRST_SUM.cpp
@@ -30,8 +30,9 @@ FIRST_SUM::FIRST_SUM(const RunParams& params)
 
   setItsPerRep( m_N-1 );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type ) * m_N );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * (m_N-1) );
+  setBytesReadPerRep( 1*sizeof(Real_type ) * m_N ); // y (each iterate accesses the range [i-1, i])
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * (m_N-1) ); // x
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * (m_N-1));
 
diff --git a/src/lcals/GEN_LIN_RECUR.cpp b/src/lcals/GEN_LIN_RECUR.cpp
index 74d8ce2bd..54bd21011 100644
--- a/src/lcals/GEN_LIN_RECUR.cpp
+++ b/src/lcals/GEN_LIN_RECUR.cpp
@@ -30,10 +30,12 @@ GEN_LIN_RECUR::GEN_LIN_RECUR(const RunParams& params)
 
   setItsPerRep( 2 * m_N );
   setKernelsPerRep(2);
-  setBytesReadPerRep( 3*sizeof(Real_type ) * m_N +
-                      3*sizeof(Real_type ) * m_N );
-  setBytesWrittenPerRep( 2*sizeof(Real_type ) * m_N +
-                         2*sizeof(Real_type ) * m_N );
+  setBytesReadPerRep( 2*sizeof(Real_type ) * m_N + // sa, sb
+                      2*sizeof(Real_type ) * m_N ); // sa, sb
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_N + // b5
+                         1*sizeof(Real_type ) * m_N ); // b5
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type ) * m_N + // stb5
+                               1*sizeof(Real_type ) * m_N ); // stb5
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep((3 +
                   3 ) * m_N);
diff --git a/src/lcals/HYDRO_1D.cpp b/src/lcals/HYDRO_1D.cpp
index ec44c3a69..0b1bb98e9 100644
--- a/src/lcals/HYDRO_1D.cpp
+++ b/src/lcals/HYDRO_1D.cpp
@@ -30,9 +30,10 @@ HYDRO_1D::HYDRO_1D(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type ) * getActualProblemSize() +
-                      1*sizeof(Real_type ) * (getActualProblemSize()+1) );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type ) * getActualProblemSize() + // y
+                      1*sizeof(Real_type ) * (getActualProblemSize()+1) ); // z (each iterate accesses the range [i+10, i+11])
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() ); // x
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(5 * getActualProblemSize());
 
diff --git a/src/lcals/HYDRO_2D.cpp b/src/lcals/HYDRO_2D.cpp
index 9d6b2d7e6..29058c557 100644
--- a/src/lcals/HYDRO_2D.cpp
+++ b/src/lcals/HYDRO_2D.cpp
@@ -40,18 +40,23 @@ HYDRO_2D::HYDRO_2D(const RunParams& params)
 
   setItsPerRep( 3 * (m_kn-2) * (m_jn-2) );
   setKernelsPerRep(3);
-  setBytesReadPerRep( 4*sizeof(Real_type ) * ((m_kn-1) * (m_jn-1) - 1) +
+  setBytesReadPerRep( 2*sizeof(Real_type ) * ((m_kn-1) * (m_jn-1) - 1) + // zp, zq (4 point stencil)
+                      2*sizeof(Real_type ) * ((m_kn-1) * (m_jn-1) - 1) + // zr, zm (3 point stencil)
 
-                      2*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) +
-                      2*sizeof(Real_type ) * (m_kn-2) * (m_jn-1) +
-                      2*sizeof(Real_type ) * ((m_kn) * (m_jn) - 4) +
+                      2*sizeof(Real_type ) * (m_kn-2) * (m_jn-1) + // za, zb (2 point stencil)
+                      2*sizeof(Real_type ) * ((m_kn) * (m_jn) - 4) + // zz, zr (5 point stencil)
 
-                      4*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) );
-  setBytesWrittenPerRep( 2*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) +
+                      4*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) ); // zr, zu, zz, zv
+  setBytesWrittenPerRep( 2*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) + // za, zb
 
-                         2*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) +
+                         0 +
 
-                         2*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) );
+                         2*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) ); // zrout, zzout
+  setBytesModifyWrittenPerRep( 0 +
+
+                               2*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) + // zu, zv
+
+                               0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep((14 +
                   26 +
diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp
index 470265d7c..0d5beaa64 100644
--- a/src/lcals/INT_PREDICT.cpp
+++ b/src/lcals/INT_PREDICT.cpp
@@ -28,8 +28,9 @@ INT_PREDICT::INT_PREDICT(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 10*sizeof(Real_type ) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() );
+  setBytesReadPerRep( 10*sizeof(Real_type ) * getActualProblemSize() ); // px(12), px(11), px(10), px(9), px(8), px(7), px(6), px(4), px(5), px(2)
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() ); // px(0)
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(17 * getActualProblemSize());
 
diff --git a/src/lcals/PLANCKIAN.cpp b/src/lcals/PLANCKIAN.cpp
index d1316fa73..76f7fbedd 100644
--- a/src/lcals/PLANCKIAN.cpp
+++ b/src/lcals/PLANCKIAN.cpp
@@ -28,8 +28,9 @@ PLANCKIAN::PLANCKIAN(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 3*sizeof(Real_type ) * getActualProblemSize() );
-  setBytesWrittenPerRep( 2*sizeof(Real_type ) * getActualProblemSize() );
+  setBytesReadPerRep( 3*sizeof(Real_type ) * getActualProblemSize() ); // u, v, x
+  setBytesWrittenPerRep( 2*sizeof(Real_type ) * getActualProblemSize() ); // y, w
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(4 * getActualProblemSize()); // 1 exp
 
diff --git a/src/lcals/TRIDIAG_ELIM.cpp b/src/lcals/TRIDIAG_ELIM.cpp
index e99b8e908..9fd6855b2 100644
--- a/src/lcals/TRIDIAG_ELIM.cpp
+++ b/src/lcals/TRIDIAG_ELIM.cpp
@@ -30,8 +30,9 @@ TRIDIAG_ELIM::TRIDIAG_ELIM(const RunParams& params)
 
   setItsPerRep( m_N-1 );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 3*sizeof(Real_type ) * (m_N-1) );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * (m_N-1) );
+  setBytesReadPerRep( 3*sizeof(Real_type ) * (m_N-1) ); // z, y, xin
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * (m_N-1) ); // xout
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * (m_N-1));
 
diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp
index e5046dc46..90738cc96 100644
--- a/src/polybench/POLYBENCH_2MM.cpp
+++ b/src/polybench/POLYBENCH_2MM.cpp
@@ -44,14 +44,15 @@ POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& params)
 
   setItsPerRep( m_ni*m_nj + m_ni*m_nl );
   setKernelsPerRep(2);
-  setBytesReadPerRep( 1*sizeof(Real_type ) * m_ni * m_nk +
-                      1*sizeof(Real_type ) * m_nj * m_nk +
+  setBytesReadPerRep( 1*sizeof(Real_type ) * m_ni * m_nk + // A
+                      1*sizeof(Real_type ) * m_nj * m_nk + // B
 
-                      1*sizeof(Real_type ) * m_ni * m_nj +
-                      1*sizeof(Real_type ) * m_nj * m_nl );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ni * m_nj +
+                      1*sizeof(Real_type ) * m_ni * m_nj + // tmp
+                      1*sizeof(Real_type ) * m_nj * m_nl ); // C
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ni * m_nj + // tmp
 
-                         1*sizeof(Real_type ) * m_ni * m_nl );
+                         1*sizeof(Real_type ) * m_ni * m_nl ); // D
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(3 * m_ni*m_nj*m_nk +
                  2 * m_ni*m_nj*m_nl );
diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp
index af208551b..f17e4a999 100644
--- a/src/polybench/POLYBENCH_3MM.cpp
+++ b/src/polybench/POLYBENCH_3MM.cpp
@@ -47,19 +47,20 @@ POLYBENCH_3MM::POLYBENCH_3MM(const RunParams& params)
 
   setItsPerRep( m_ni*m_nj + m_nj*m_nl + m_ni*m_nl );
   setKernelsPerRep(3);
-  setBytesReadPerRep( 1*sizeof(Real_type ) * m_ni * m_nk +
-                      1*sizeof(Real_type ) * m_nj * m_nk +
+  setBytesReadPerRep( 1*sizeof(Real_type ) * m_ni * m_nk + // A
+                      1*sizeof(Real_type ) * m_nj * m_nk + // B
 
-                      1*sizeof(Real_type ) * m_nj * m_nm +
-                      1*sizeof(Real_type ) * m_nl * m_nm +
+                      1*sizeof(Real_type ) * m_nj * m_nm + // C
+                      1*sizeof(Real_type ) * m_nl * m_nm + // D
 
-                      1*sizeof(Real_type ) * m_ni * m_nj +
-                      1*sizeof(Real_type ) * m_nj * m_nl );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ni * m_nj +
+                      1*sizeof(Real_type ) * m_ni * m_nj + // E
+                      1*sizeof(Real_type ) * m_nj * m_nl ); // F
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ni * m_nj + // E
 
-                         1*sizeof(Real_type ) * m_nj * m_nl +
+                         1*sizeof(Real_type ) * m_nj * m_nl + // F
 
-                         1*sizeof(Real_type ) * m_ni * m_nl );
+                         1*sizeof(Real_type ) * m_ni * m_nl ); // G
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * m_ni*m_nj*m_nk +
                  2 * m_nj*m_nl*m_nm +
diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp
index 95e679cfd..3da785dee 100644
--- a/src/polybench/POLYBENCH_ADI.cpp
+++ b/src/polybench/POLYBENCH_ADI.cpp
@@ -34,16 +34,17 @@ POLYBENCH_ADI::POLYBENCH_ADI(const RunParams& params)
   setActualProblemSize( (m_n-2) * (m_n-2) );
 
   setKernelsPerRep( 2 );
-  setBytesReadPerRep( 1*sizeof(Real_type ) * (m_n-2) * (m_n  ) +
-                      2*sizeof(Real_type ) * (m_n-2) * (m_n-2) +
+  setBytesReadPerRep( 1*sizeof(Real_type ) * (m_n-2) * (m_n  ) + // u
 
-                      1*sizeof(Real_type ) * (m_n-2) * (m_n  ) +
-                      2*sizeof(Real_type ) * (m_n-2) * (m_n-2) );
-  setBytesWrittenPerRep( 2*sizeof(Real_type ) * (m_n-2) * (m_n-1) +
-                         1*sizeof(Real_type ) * (m_n-2) * (m_n  ) +
+                      1*sizeof(Real_type ) * (m_n-2) * (m_n  ) ); // v
+  setBytesWrittenPerRep( 2*sizeof(Real_type ) * (m_n-2) * (    1) + // p, q
+                         1*sizeof(Real_type ) * (m_n-2) * (m_n  ) + // v
 
-                         2*sizeof(Real_type ) * (m_n-2) * (m_n-1) +
-                         1*sizeof(Real_type ) * (m_n-2) * (m_n  ) );
+                         2*sizeof(Real_type ) * (m_n-2) * (    1) + // p, q
+                         1*sizeof(Real_type ) * (m_n-2) * (m_n  ) ); // u
+  setBytesModifyWrittenPerRep( 2*sizeof(Real_type ) * (m_n-2) * (m_n-2) + // p, q
+
+                               2*sizeof(Real_type ) * (m_n-2) * (m_n-2) ); // p, q
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep( (13 + 2) * (m_n-2)*(m_n-2) +
                   (13 + 2) * (m_n-2)*(m_n-2) );
diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp
index bb3100060..1d28e2d14 100644
--- a/src/polybench/POLYBENCH_ATAX.cpp
+++ b/src/polybench/POLYBENCH_ATAX.cpp
@@ -33,13 +33,17 @@ POLYBENCH_ATAX::POLYBENCH_ATAX(const RunParams& params)
 
   setItsPerRep( 2 * m_N + m_N );
   setKernelsPerRep(2);
-  setBytesReadPerRep( 1*sizeof(Real_type ) * m_N +
-                      1*sizeof(Real_type ) * m_N * m_N +
+  setBytesReadPerRep( 1*sizeof(Real_type ) * m_N +       // x
+                      1*sizeof(Real_type ) * m_N * m_N + // A
 
-                      2*sizeof(Real_type ) * m_N +
-                      1*sizeof(Real_type ) * m_N * m_N );
-  setBytesWrittenPerRep( 2*sizeof(Real_type ) * m_N +
-                         1*sizeof(Real_type ) * m_N);
+                      1*sizeof(Real_type ) * m_N +        // tmp
+                      1*sizeof(Real_type ) * m_N * m_N ); // A
+  setBytesWrittenPerRep( 2*sizeof(Real_type ) * m_N + // y, tmp
+
+                         0);
+  setBytesModifyWrittenPerRep( 0 +
+
+                               1*sizeof(Real_type ) * m_N ); // y
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * m_N*m_N +
                  2 * m_N*m_N );
diff --git a/src/polybench/POLYBENCH_FDTD_2D.cpp b/src/polybench/POLYBENCH_FDTD_2D.cpp
index aea03e4f2..fe3929ba2 100644
--- a/src/polybench/POLYBENCH_FDTD_2D.cpp
+++ b/src/polybench/POLYBENCH_FDTD_2D.cpp
@@ -43,24 +43,28 @@ POLYBENCH_FDTD_2D::POLYBENCH_FDTD_2D(const RunParams& params)
                 m_nx*(m_ny-1) +
                 (m_nx-1)*(m_ny-1) );
   setKernelsPerRep(4);
-  setBytesReadPerRep( 1*sizeof(Real_type ) +
+  setBytesReadPerRep( 1*sizeof(Real_type ) + // fict
 
-                      1*sizeof(Real_type ) * (m_nx-1) * m_ny +
-                      1*sizeof(Real_type ) * m_nx * m_ny +
+                      1*sizeof(Real_type ) * m_nx * m_ny + // hz
 
-                      1*sizeof(Real_type ) * m_nx * (m_ny-1) +
-                      1*sizeof(Real_type ) * m_nx * m_ny +
+                      1*sizeof(Real_type ) * m_nx * m_ny + // hz
 
-                      1*sizeof(Real_type ) * (m_nx-1) * (m_ny-1) +
-                      1*sizeof(Real_type ) * (m_nx-1) * m_ny +
-                      1*sizeof(Real_type ) * m_nx * (m_ny-1) );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ny +
+                      1*sizeof(Real_type ) * (m_nx-1) * m_ny + // ex
+                      1*sizeof(Real_type ) * m_nx * (m_ny-1) ); // ey
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ny + // ey
 
-                         1*sizeof(Real_type ) * (m_nx-1) * m_ny +
+                         0 +
 
-                         1*sizeof(Real_type ) * m_nx * (m_ny-1) +
+                         0 +
 
-                         1*sizeof(Real_type ) * (m_nx-1) * (m_ny-1) );
+                         0 ); // nothing write-only here, hz is counted as modify-written
+  setBytesModifyWrittenPerRep( 0 +
+
+                               1*sizeof(Real_type ) * (m_nx-1) * m_ny + // ey
+
+                               1*sizeof(Real_type ) * m_nx * (m_ny-1) + // ex
+
+                               1*sizeof(Real_type ) * (m_nx-1) * (m_ny-1) ); // hz
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep( 0 * m_ny +
                   3 * (m_nx-1)*m_ny +
diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp
index 08dcb5e57..0f01afdea 100644
--- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp
+++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp
@@ -33,8 +33,9 @@ POLYBENCH_FLOYD_WARSHALL::POLYBENCH_FLOYD_WARSHALL(const RunParams& params)
 
   setItsPerRep( m_N * m_N*m_N );
   setKernelsPerRep(m_N);
-  setBytesReadPerRep( m_N * 1*sizeof(Real_type ) * m_N * m_N );
-  setBytesWrittenPerRep( m_N * 1*sizeof(Real_type ) * m_N * m_N );
+  setBytesReadPerRep( m_N * 1*sizeof(Real_type ) * m_N * m_N ); // pin
+  setBytesWrittenPerRep( m_N * 1*sizeof(Real_type ) * m_N * m_N ); // pout
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep( m_N*m_N*m_N * 3 / 2 ); // conditional is true about half of the time
 
diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp
index e1850135b..9ef30c647 100644
--- a/src/polybench/POLYBENCH_GEMM.cpp
+++ b/src/polybench/POLYBENCH_GEMM.cpp
@@ -40,10 +40,10 @@ POLYBENCH_GEMM::POLYBENCH_GEMM(const RunParams& params)
 
   setItsPerRep( m_ni * m_nj );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type ) * m_ni * m_nj +
-                      1*sizeof(Real_type ) * m_ni * m_nk +
-                      1*sizeof(Real_type ) * m_nj * m_nk );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ni * m_nj);
+  setBytesReadPerRep( 1*sizeof(Real_type ) * m_ni * m_nk + // A
+                      1*sizeof(Real_type ) * m_nj * m_nk ); // B
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ni * m_nj); // C
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep((1 +
                   3 * m_nk) * m_ni*m_nj);
diff --git a/src/polybench/POLYBENCH_GEMVER.cpp b/src/polybench/POLYBENCH_GEMVER.cpp
index 7ea31efea..82d584c0e 100644
--- a/src/polybench/POLYBENCH_GEMVER.cpp
+++ b/src/polybench/POLYBENCH_GEMVER.cpp
@@ -39,23 +39,29 @@ POLYBENCH_GEMVER::POLYBENCH_GEMVER(const RunParams& params)
                 m_n +
                 m_n );
   setKernelsPerRep(4);
-  setBytesReadPerRep( 1*sizeof(Real_type ) * m_n * m_n +
-                      4*sizeof(Real_type ) * m_n +
+  setBytesReadPerRep( 4*sizeof(Real_type ) * m_n + // u1, v1, u2, v2
 
-                      1*sizeof(Real_type ) * m_n * m_n +
-                      1*sizeof(Real_type ) * m_n +
+                      1*sizeof(Real_type ) * m_n * m_n + // A
+                      1*sizeof(Real_type ) * m_n + // y
 
-                      2*sizeof(Real_type ) * m_n +
+                      1*sizeof(Real_type ) * m_n + // z
 
-                      1*sizeof(Real_type ) * m_n * m_n +
-                      2*sizeof(Real_type ) * m_n );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_n * m_n +
+                      1*sizeof(Real_type ) * m_n * m_n + // A
+                      2*sizeof(Real_type ) * m_n ); // x (NOTE(review): factor is 2 but only one array is named - confirm the second array)
+  setBytesWrittenPerRep( 0 +
 
-                         1*sizeof(Real_type ) * m_n +
+                         1*sizeof(Real_type ) * m_n + // x
 
-                         1*sizeof(Real_type ) * m_n +
+                         0 +
 
-                         1*sizeof(Real_type ) * m_n );
+                         1*sizeof(Real_type ) * m_n ); // w
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type ) * m_n * m_n + // A
+
+                               0 +
+
+                               1*sizeof(Real_type ) * m_n + // x
+
+                               0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(4 * m_n*m_n +
                  3 * m_n*m_n +
diff --git a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp
index 89ae3fd4a..690f950f8 100644
--- a/src/polybench/POLYBENCH_GESUMMV.cpp
+++ b/src/polybench/POLYBENCH_GESUMMV.cpp
@@ -36,9 +36,10 @@ POLYBENCH_GESUMMV::POLYBENCH_GESUMMV(const RunParams& params)
 
   setItsPerRep( m_N );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type ) * m_N +
-                      2*sizeof(Real_type ) * m_N * m_N );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_N );
+  setBytesReadPerRep( 1*sizeof(Real_type ) * m_N + // x
+                      2*sizeof(Real_type ) * m_N * m_N ); // A, B
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_N ); // y
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep((4 * m_N +
                   3 ) * m_N  );
diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp
index d8150325f..cf7da4db6 100644
--- a/src/polybench/POLYBENCH_HEAT_3D.cpp
+++ b/src/polybench/POLYBENCH_HEAT_3D.cpp
@@ -34,10 +34,13 @@ POLYBENCH_HEAT_3D::POLYBENCH_HEAT_3D(const RunParams& params)
 
   setItsPerRep( 2 * getActualProblemSize() );
   setKernelsPerRep( 2 );
-  setBytesReadPerRep( 1*sizeof(Real_type ) * (m_N * m_N * m_N - 12*(m_N-2) - 8) +
-                      1*sizeof(Real_type ) * (m_N * m_N * m_N - 12*(m_N-2) - 8));
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * (m_N-2) * (m_N-2) * (m_N-2) +
-                         1*sizeof(Real_type ) * (m_N-2) * (m_N-2) * (m_N-2) );
+  setBytesReadPerRep( 1*sizeof(Real_type ) * (m_N * m_N * m_N - 12*(m_N-2) - 8) + // A (7 point stencil)
+
+                      1*sizeof(Real_type ) * (m_N * m_N * m_N - 12*(m_N-2) - 8)); // B (7 point stencil)
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * (m_N-2) * (m_N-2) * (m_N-2) + // B
+
+                         1*sizeof(Real_type ) * (m_N-2) * (m_N-2) * (m_N-2) ); // A
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep( 15 * (m_N-2) * (m_N-2) * (m_N-2) +
                   15 * (m_N-2) * (m_N-2) * (m_N-2) );
diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp
index bad9a047c..ce587a6d7 100644
--- a/src/polybench/POLYBENCH_JACOBI_1D.cpp
+++ b/src/polybench/POLYBENCH_JACOBI_1D.cpp
@@ -33,10 +33,13 @@ POLYBENCH_JACOBI_1D::POLYBENCH_JACOBI_1D(const RunParams& params)
 
   setItsPerRep( 2 * getActualProblemSize() );
   setKernelsPerRep(2);
-  setBytesReadPerRep( 1*sizeof(Real_type ) * m_N +
-                      1*sizeof(Real_type ) * m_N );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * (m_N-2) +
-                         1*sizeof(Real_type ) * (m_N-2) );
+  setBytesReadPerRep( 1*sizeof(Real_type ) * m_N + // A (3 point stencil)
+
+                      1*sizeof(Real_type ) * m_N ); // B (3 point stencil)
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * (m_N-2) + // B
+
+                         1*sizeof(Real_type ) * (m_N-2) ); // A
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep( 3 * (m_N-2) +
                   3 * (m_N-2) );
diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp
index 08f04bd4e..3630d765b 100644
--- a/src/polybench/POLYBENCH_JACOBI_2D.cpp
+++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp
@@ -33,10 +33,13 @@ POLYBENCH_JACOBI_2D::POLYBENCH_JACOBI_2D(const RunParams& params)
 
   setItsPerRep( 2 * (m_N-2) * (m_N-2) );
   setKernelsPerRep(2);
-  setBytesReadPerRep( 1*sizeof(Real_type ) * (m_N * m_N - 4) +
-                      1*sizeof(Real_type ) * (m_N * m_N - 4) );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * (m_N-2) * (m_N-2) +
-                         1*sizeof(Real_type ) * (m_N-2) * (m_N-2) );
+  setBytesReadPerRep( 1*sizeof(Real_type ) * (m_N * m_N - 4) + // A (5 point stencil)
+
+                      1*sizeof(Real_type ) * (m_N * m_N - 4) ); // B (5 point stencil)
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * (m_N-2) * (m_N-2) + // B
+
+                         1*sizeof(Real_type ) * (m_N-2) * (m_N-2) ); // A
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep( 5 * (m_N-2)*(m_N-2) +
                   5 * (m_N-2)*(m_N-2) );
diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp
index 8a35c2f65..2292f6bc3 100644
--- a/src/polybench/POLYBENCH_MVT.cpp
+++ b/src/polybench/POLYBENCH_MVT.cpp
@@ -33,14 +33,15 @@ POLYBENCH_MVT::POLYBENCH_MVT(const RunParams& params)
 
   setItsPerRep( 2 * m_N );
   setKernelsPerRep(2);
-  setBytesReadPerRep( 1*sizeof(Real_type ) * m_N +
-                      1*sizeof(Real_type ) * m_N * m_N +
+  setBytesReadPerRep( 1*sizeof(Real_type ) * m_N + // y1
+                      1*sizeof(Real_type ) * m_N * m_N + // A
 
-                      1*sizeof(Real_type ) * m_N +
-                      1*sizeof(Real_type ) * m_N * m_N );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_N +
+                      1*sizeof(Real_type ) * m_N + // y2
+                      1*sizeof(Real_type ) * m_N * m_N ); // A
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_N + // x1
 
-                         1*sizeof(Real_type ) * m_N );
+                         1*sizeof(Real_type ) * m_N ); // x2
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * m_N*m_N +
                  2 * m_N*m_N );
diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp
index b78bc35f4..bec04786c 100644
--- a/src/stream/ADD.cpp
+++ b/src/stream/ADD.cpp
@@ -28,8 +28,9 @@ ADD::ADD(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() );
+  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // a, b
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() ); // c
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * getActualProblemSize());
 
diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp
index 206acf31b..71037f6cf 100644
--- a/src/stream/COPY.cpp
+++ b/src/stream/COPY.cpp
@@ -28,8 +28,9 @@ COPY::COPY(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // a
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() ); // c
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/stream/DOT.cpp b/src/stream/DOT.cpp
index a3d782248..34a0ca5ef 100644
--- a/src/stream/DOT.cpp
+++ b/src/stream/DOT.cpp
@@ -28,9 +28,9 @@ DOT::DOT(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) +
-                      2*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) );
+  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // a, b
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * getActualProblemSize());
 
diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp
index dffbf8528..5ccac63c0 100644
--- a/src/stream/MUL.cpp
+++ b/src/stream/MUL.cpp
@@ -28,8 +28,9 @@ MUL::MUL(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // c
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // b
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * getActualProblemSize());
 
diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp
index b61be0a2f..e66f5188e 100644
--- a/src/stream/TRIAD.cpp
+++ b/src/stream/TRIAD.cpp
@@ -28,8 +28,9 @@ TRIAD::TRIAD(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // b, c
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // a
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * getActualProblemSize());