From 3af24bebac6feb8ce68a91ceb14027150c563fd7 Mon Sep 17 00:00:00 2001
From: Jason Burmark <burmark1@llnl.gov>
Date: Tue, 16 Dec 2025 08:05:31 -0800
Subject: [PATCH 1/5] Add modify written and bytes touched

Now the memory accesses are split into read, write, modify write,
and atomic modify write categories. Each memory address touched
should only appear in exactly one of these memory attributes
per loop-nest/kernel launch.

Continue to output Bytes which is the total memory traffic. This is
calculated via (read + write + 2*(modify write + atomic modify write)).

Also output BytesTouched which is the amount of memory used. This is
calculated as the sum of the 4 categories of memory accesses.

These numbers are idealized and real memory traffic may be higher if
perfect caching is not achieved. They are also sometimes estimates
as some kernels have conditionals that rely on random numbers or
complex implementations.
---
 src/common/Executor.cpp   | 20 ++++++++++++++++++++
 src/common/KernelBase.cpp | 16 ++++++++++++++++
 src/common/KernelBase.hpp |  8 +++++++-
 3 files changed, 43 insertions(+), 1 deletion(-)
diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp
index 7805376bb..f79e8203b 100644
--- a/src/common/Executor.cpp
+++ b/src/common/Executor.cpp
@@ -529,8 +529,10 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const
   Index_type itsrep_width = 0;
   Index_type bytesrep_width = 0;
   Index_type flopsrep_width = 0;
+  Index_type bytesTouchedrep_width = 0;
   Index_type bytesReadrep_width = 0;
   Index_type bytesWrittenrep_width = 0;
+  Index_type bytesModifyWrittenrep_width = 0;
   Index_type bytesAtomicModifyWrittenrep_width = 0;
   Index_type dash_width = 0;
 
@@ -541,8 +543,10 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const
     itsrep_width = max(itsrep_width, kernels[ik]->getItsPerRep());
     bytesrep_width = max(bytesrep_width, kernels[ik]->getBytesPerRep());
     flopsrep_width = max(flopsrep_width, kernels[ik]->getFLOPsPerRep());
+    bytesTouchedrep_width = max(bytesTouchedrep_width, kernels[ik]->getBytesTouchedPerRep());
     bytesReadrep_width = max(bytesReadrep_width, kernels[ik]->getBytesReadPerRep());
     bytesWrittenrep_width = max(bytesWrittenrep_width, kernels[ik]->getBytesWrittenPerRep());
+    bytesModifyWrittenrep_width = max(bytesModifyWrittenrep_width, kernels[ik]->getBytesModifyWrittenPerRep());
     bytesAtomicModifyWrittenrep_width = max(bytesAtomicModifyWrittenrep_width, kernels[ik]->getBytesAtomicModifyWrittenPerRep());
   }
 
@@ -587,6 +591,12 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const
                          static_cast<Index_type>(frsize) ) + 3;
   dash_width += flopsrep_width + static_cast<Index_type>(sepchr.size());
 
+  double btrsize = log10( static_cast<double>(bytesTouchedrep_width) );
+  string bytesTouchedrep_head("BytesTouched/rep");
+  bytesTouchedrep_width = max( static_cast<Index_type>(bytesTouchedrep_head.size()),
+                        static_cast<Index_type>(btrsize) ) + 3;
+  dash_width += bytesTouchedrep_width + static_cast<Index_type>(sepchr.size());
+
   double brrsize = log10( static_cast<double>(bytesReadrep_width) );
   string bytesReadrep_head("BytesRead/rep");
   bytesReadrep_width = max( static_cast<Index_type>(bytesReadrep_head.size()),
@@ -599,6 +609,12 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const
                         static_cast<Index_type>(bwrsize) ) + 3;
   dash_width += bytesWrittenrep_width + static_cast<Index_type>(sepchr.size());
 
+  double bmwrsize = log10( static_cast<double>(bytesModifyWrittenrep_width) );
+  string bytesModifyWrittenrep_head("BytesModifyWritten/rep");
+  bytesModifyWrittenrep_width = max( static_cast<Index_type>(bytesModifyWrittenrep_head.size()),
+                        static_cast<Index_type>(bmwrsize) ) + 3;
+  dash_width += bytesModifyWrittenrep_width + static_cast<Index_type>(sepchr.size());
+
   double bamrrsize = log10( static_cast<double>(bytesAtomicModifyWrittenrep_width) );
   string bytesAtomicModifyWrittenrep_head("BytesAtomicModifyWritten/rep");
   bytesAtomicModifyWrittenrep_width = max( static_cast<Index_type>(bytesAtomicModifyWrittenrep_head.size()),
@@ -612,8 +628,10 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const
       << sepchr <<right<< setw(kernsrep_width) << kernsrep_head
       << sepchr <<right<< setw(bytesrep_width) << bytesrep_head
       << sepchr <<right<< setw(flopsrep_width) << flopsrep_head
+      << sepchr <<right<< setw(bytesTouchedrep_width) << bytesTouchedrep_head
       << sepchr <<right<< setw(bytesReadrep_width) << bytesReadrep_head
       << sepchr <<right<< setw(bytesWrittenrep_width) << bytesWrittenrep_head
+      << sepchr <<right<< setw(bytesModifyWrittenrep_width) << bytesModifyWrittenrep_head
       << sepchr <<right<< setw(bytesAtomicModifyWrittenrep_width) << bytesAtomicModifyWrittenrep_head
       << endl;
 
@@ -633,8 +651,10 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const
         << sepchr <<right<< setw(kernsrep_width) << kern->getKernelsPerRep()
         << sepchr <<right<< setw(bytesrep_width) << kern->getBytesPerRep()
         << sepchr <<right<< setw(flopsrep_width) << kern->getFLOPsPerRep()
+        << sepchr <<right<< setw(bytesTouchedrep_width) << kern->getBytesTouchedPerRep()
         << sepchr <<right<< setw(bytesReadrep_width) << kern->getBytesReadPerRep()
         << sepchr <<right<< setw(bytesWrittenrep_width) << kern->getBytesWrittenPerRep()
+        << sepchr <<right<< setw(bytesModifyWrittenrep_width) << kern->getBytesModifyWrittenPerRep()
         << sepchr <<right<< setw(bytesAtomicModifyWrittenrep_width) << kern->getBytesAtomicModifyWrittenPerRep()
         << endl;
   }
diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp
index 873ef6e40..663491ba7 100644
--- a/src/common/KernelBase.cpp
+++ b/src/common/KernelBase.cpp
@@ -44,6 +44,7 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params)
   kernels_per_rep = -1;
   bytes_read_per_rep = -1;
   bytes_written_per_rep = -1;
+  bytes_modify_written_per_rep = -1;
   bytes_atomic_modify_written_per_rep = -1;
   FLOPs_per_rep = -1;
 
@@ -75,6 +76,10 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params)
                                          CALI_ATTR_ASVALUE |
                                          CALI_ATTR_AGGREGATABLE |
                                          CALI_ATTR_SKIP_EVENTS);
+  Bytes_Touched_Rep_attr = cali_create_attribute("BytesTouched/Rep", CALI_TYPE_INT,
+                                                 CALI_ATTR_ASVALUE |
+                                                 CALI_ATTR_AGGREGATABLE |
+                                                 CALI_ATTR_SKIP_EVENTS);
   Bytes_Read_Rep_attr = cali_create_attribute("BytesRead/Rep", CALI_TYPE_INT,
                                               CALI_ATTR_ASVALUE |
                                               CALI_ATTR_AGGREGATABLE |
@@ -83,6 +88,10 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params)
                                                  CALI_ATTR_ASVALUE |
                                                  CALI_ATTR_AGGREGATABLE |
                                                  CALI_ATTR_SKIP_EVENTS);
+  Bytes_ModifyWritten_Rep_attr = cali_create_attribute("BytesModifyWritten/Rep", CALI_TYPE_INT,
+                                                       CALI_ATTR_ASVALUE |
+                                                       CALI_ATTR_AGGREGATABLE |
+                                                       CALI_ATTR_SKIP_EVENTS);
   Bytes_AtomicModifyWritten_Rep_attr = cali_create_attribute("BytesAtomicModifyWritten/Rep", CALI_TYPE_INT,
                                                              CALI_ATTR_ASVALUE |
                                                              CALI_ATTR_AGGREGATABLE |
@@ -367,6 +376,7 @@ void KernelBase::print(std::ostream& os) const
   os << "\t\t\t kernels_per_rep = " << kernels_per_rep << std::endl;
   os << "\t\t\t bytes_read_per_rep = " << bytes_read_per_rep << std::endl;
   os << "\t\t\t bytes_written_per_rep = " << bytes_written_per_rep << std::endl;
+  os << "\t\t\t bytes_modify_written_per_rep = " << bytes_modify_written_per_rep << std::endl;
   os << "\t\t\t bytes_atomic_modify_written_per_rep = " << bytes_atomic_modify_written_per_rep << std::endl;
   os << "\t\t\t FLOPs_per_rep = " << FLOPs_per_rep << std::endl;
   os << "\t\t\t num_exec: " << std::endl;
@@ -426,8 +436,10 @@ void KernelBase::doOnceCaliMetaBegin(VariantID vid, size_t tune_idx)
     cali_set_helper(Iters_Rep_attr, getItsPerRep());
     cali_set_helper(Kernels_Rep_attr, getKernelsPerRep());
     cali_set_helper(Bytes_Rep_attr, getBytesPerRep());
+    cali_set_helper(Bytes_Touched_Rep_attr, getBytesTouchedPerRep());
     cali_set_helper(Bytes_Read_Rep_attr, getBytesReadPerRep());
     cali_set_helper(Bytes_Written_Rep_attr, getBytesWrittenPerRep());
+    cali_set_helper(Bytes_ModifyWritten_Rep_attr, getBytesModifyWrittenPerRep());
     cali_set_helper(Bytes_AtomicModifyWritten_Rep_attr, getBytesAtomicModifyWrittenPerRep());
     cali_set_helper(Flops_Rep_attr, getFLOPsPerRep());
     cali_set_helper(BlockSize_attr, getBlockSize());
@@ -477,8 +489,10 @@ void KernelBase::setCaliperMgrVariantTuning(VariantID vid,
           { "expr": "any(max#Iterations/Rep)", "as": "Iterations/Rep" },
           { "expr": "any(max#Kernels/Rep)", "as": "Kernels/Rep" },
           { "expr": "any(max#Bytes/Rep)", "as": "Bytes/Rep" },
+          { "expr": "any(max#BytesTouched/Rep)", "as": "BytesTouched/Rep" },
           { "expr": "any(max#BytesRead/Rep)", "as": "BytesRead/Rep" },
           { "expr": "any(max#BytesWritten/Rep)", "as": "BytesWritten/Rep" },
+          { "expr": "any(max#BytesModifyWritten/Rep)", "as": "BytesModifyWritten/Rep" },
           { "expr": "any(max#BytesAtomicModifyWritten/Rep)", "as": "BytesAtomicModifyWritten/Rep" },
           { "expr": "any(max#Flops/Rep)", "as": "Flops/Rep" },
           { "expr": "any(max#BlockSize)", "as": "BlockSize" },
@@ -504,8 +518,10 @@ void KernelBase::setCaliperMgrVariantTuning(VariantID vid,
           { "expr": "any(any#max#Iterations/Rep)", "as": "Iterations/Rep" },
           { "expr": "any(any#max#Kernels/Rep)", "as": "Kernels/Rep" },
           { "expr": "any(any#max#Bytes/Rep)", "as": "Bytes/Rep" },
+          { "expr": "any(any#max#BytesTouched/Rep)", "as": "BytesTouched/Rep" },
           { "expr": "any(any#max#BytesRead/Rep)", "as": "BytesRead/Rep" },
           { "expr": "any(any#max#BytesWritten/Rep)", "as": "BytesWritten/Rep" },
+          { "expr": "any(any#max#BytesModifyWritten/Rep)", "as": "BytesModifyWritten/Rep" },
           { "expr": "any(any#max#BytesAtomicModifyWritten/Rep)", "as": "BytesAtomicModifyWritten/Rep" },
           { "expr": "any(any#max#Flops/Rep)", "as": "Flops/Rep" },
           { "expr": "any(any#max#BlockSize)", "as": "BlockSize" },
diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp
index 95889333e..64021d887 100644
--- a/src/common/KernelBase.hpp
+++ b/src/common/KernelBase.hpp
@@ -109,6 +109,7 @@ class KernelBase
   void setKernelsPerRep(Index_type nkerns) { kernels_per_rep = nkerns; };
   void setBytesReadPerRep(Index_type bytes) { bytes_read_per_rep = bytes;}
   void setBytesWrittenPerRep(Index_type bytes) { bytes_written_per_rep = bytes;}
+  void setBytesModifyWrittenPerRep(Index_type bytes) { bytes_modify_written_per_rep = bytes;}
   void setBytesAtomicModifyWrittenPerRep(Index_type bytes) { bytes_atomic_modify_written_per_rep = bytes;}
   void setFLOPsPerRep(Index_type FLOPs) { FLOPs_per_rep = FLOPs; }
   void setBlockSize(Index_type size) { kernel_block_size = size; }
@@ -190,9 +191,11 @@ class KernelBase
   Index_type getDefaultReps() const { return default_reps; }
   Index_type getItsPerRep() const { return its_per_rep; };
   Index_type getKernelsPerRep() const { return kernels_per_rep; };
-  Index_type getBytesPerRep() const { return bytes_read_per_rep + bytes_written_per_rep + 2*bytes_atomic_modify_written_per_rep; } // count atomic_modify_write operations as a read and a write to match previous counting
+  Index_type getBytesPerRep() const { return bytes_read_per_rep + bytes_written_per_rep + 2*bytes_modify_written_per_rep + 2*bytes_atomic_modify_written_per_rep; } // count modify_write operations twice to get the memory traffic
+  Index_type getBytesTouchedPerRep() const { return bytes_read_per_rep + bytes_written_per_rep + bytes_modify_written_per_rep + bytes_atomic_modify_written_per_rep; } // count modify_write operations once to get the data size only
   Index_type getBytesReadPerRep() const { return bytes_read_per_rep; }
   Index_type getBytesWrittenPerRep() const { return bytes_written_per_rep; }
+  Index_type getBytesModifyWrittenPerRep() const { return bytes_modify_written_per_rep; }
   Index_type getBytesAtomicModifyWrittenPerRep() const { return bytes_atomic_modify_written_per_rep; }
   Index_type getFLOPsPerRep() const { return FLOPs_per_rep; }
   double getBlockSize() const { return kernel_block_size; }
@@ -653,6 +656,7 @@ class KernelBase
   Index_type kernels_per_rep;
   Index_type bytes_read_per_rep;
   Index_type bytes_written_per_rep;
+  Index_type bytes_modify_written_per_rep;
   Index_type bytes_atomic_modify_written_per_rep;
   Index_type FLOPs_per_rep;
   double kernel_block_size = nan(""); // Set default value for non GPU kernels
@@ -672,8 +676,10 @@ class KernelBase
   cali_id_t Iters_Rep_attr;
   cali_id_t Kernels_Rep_attr;
   cali_id_t Bytes_Rep_attr;
+  cali_id_t Bytes_Touched_Rep_attr;
   cali_id_t Bytes_Read_Rep_attr;
   cali_id_t Bytes_Written_Rep_attr;
+  cali_id_t Bytes_ModifyWritten_Rep_attr;
   cali_id_t Bytes_AtomicModifyWritten_Rep_attr;
   cali_id_t Flops_Rep_attr;
   cali_id_t BlockSize_attr;

From 95fbe355cf3320aa52406da3528061e9ec234e07 Mon Sep 17 00:00:00 2001
From: Jason Burmark <burmark1@llnl.gov>
Date: Tue, 16 Dec 2025 13:56:17 -0800
Subject: [PATCH 2/5] Set bytes modify written and label variables

Add the number of bytes modify written per kernel.
Label which variable(s) are accessed for each line of the
bytes read, written, modify written, or atomic modify written.
---
 src/algorithm/ATOMIC.cpp                   |  3 +-
 src/algorithm/HISTOGRAM.cpp                |  8 +++---
 src/algorithm/MEMCPY.cpp                   |  5 ++--
 src/algorithm/MEMCPY.hpp                   |  2 +-
 src/algorithm/MEMSET.cpp                   |  3 +-
 src/algorithm/REDUCE_SUM.cpp               |  5 ++--
 src/algorithm/SCAN.cpp                     |  5 ++--
 src/algorithm/SORT.cpp                     |  6 ++--
 src/algorithm/SORTPAIRS.cpp                |  6 ++--
 src/apps/CONVECTION3DPA.cpp                |  5 ++--
 src/apps/DEL_DOT_VEC_2D.cpp                |  7 +++--
 src/apps/DIFFUSION3DPA.cpp                 |  5 ++--
 src/apps/EDGE3D.cpp                        |  5 ++--
 src/apps/ENERGY.cpp                        | 33 +++++++++++++---------
 src/apps/FEMSWEEP.cpp                      | 14 ++++-----
 src/apps/FIR.cpp                           |  7 +++--
 src/apps/INTSC_HEXHEX.cpp                  |  1 +
 src/apps/INTSC_HEXRECT.cpp                 |  1 +
 src/apps/LTIMES.cpp                        |  8 +++---
 src/apps/LTIMES_NOVIEW.cpp                 |  8 +++---
 src/apps/MASS3DEA.cpp                      |  1 +
 src/apps/MASS3DPA.cpp                      |  5 ++--
 src/apps/MASSVEC3DPA.cpp                   |  1 +
 src/apps/MATVEC_3D_STENCIL.cpp             |  9 +++---
 src/apps/NODAL_ACCUMULATION_3D.cpp         |  7 +++--
 src/apps/PRESSURE.cpp                      |  9 +++---
 src/apps/VOL3D.cpp                         |  5 ++--
 src/apps/ZONAL_ACCUMULATION_3D.cpp         |  7 +++--
 src/basic/ARRAY_OF_PTRS.cpp                |  5 ++--
 src/basic/COPY8.cpp                        |  5 ++--
 src/basic/DAXPY.cpp                        |  5 ++--
 src/basic/DAXPY_ATOMIC.cpp                 |  5 ++--
 src/basic/EMPTY.cpp                        |  1 +
 src/basic/IF_QUAD.cpp                      |  5 ++--
 src/basic/INDEXLIST.cpp                    |  7 ++---
 src/basic/INDEXLIST_3LOOP.cpp              | 21 ++++++--------
 src/basic/INIT3.cpp                        |  5 ++--
 src/basic/INIT_VIEW1D.cpp                  |  3 +-
 src/basic/INIT_VIEW1D_OFFSET.cpp           |  3 +-
 src/basic/MAT_MAT_SHARED.cpp               |  5 ++--
 src/basic/MULADDSUB.cpp                    |  5 ++--
 src/basic/MULTI_REDUCE.cpp                 | 10 +++----
 src/basic/NESTED_INIT.cpp                  |  3 +-
 src/basic/PI_ATOMIC.cpp                    |  3 +-
 src/basic/PI_REDUCE.cpp                    |  5 ++--
 src/basic/REDUCE3_INT.cpp                  |  6 ++--
 src/basic/REDUCE_STRUCT.cpp                |  6 ++--
 src/basic/TRAP_INT.cpp                     |  5 ++--
 src/comm/HALO_EXCHANGE.cpp                 | 17 +++++------
 src/comm/HALO_EXCHANGE_FUSED.cpp           | 17 +++++------
 src/comm/HALO_PACKING.cpp                  | 13 +++++----
 src/comm/HALO_PACKING_FUSED.cpp            | 13 +++++----
 src/comm/HALO_SENDRECV.cpp                 |  5 ++--
 src/lcals/DIFF_PREDICT.cpp                 |  5 ++--
 src/lcals/EOS.cpp                          |  7 +++--
 src/lcals/FIRST_DIFF.cpp                   |  5 ++--
 src/lcals/FIRST_MIN.cpp                    |  8 ++----
 src/lcals/FIRST_SUM.cpp                    |  5 ++--
 src/lcals/GEN_LIN_RECUR.cpp                | 10 ++++---
 src/lcals/HYDRO_1D.cpp                     |  7 +++--
 src/lcals/HYDRO_2D.cpp                     | 21 ++++++++------
 src/lcals/INT_PREDICT.cpp                  |  5 ++--
 src/lcals/PLANCKIAN.cpp                    |  5 ++--
 src/lcals/TRIDIAG_ELIM.cpp                 |  5 ++--
 src/polybench/POLYBENCH_2MM.cpp            | 13 +++++----
 src/polybench/POLYBENCH_3MM.cpp            | 19 +++++++------
 src/polybench/POLYBENCH_ADI.cpp            | 17 +++++------
 src/polybench/POLYBENCH_ATAX.cpp           | 16 +++++++----
 src/polybench/POLYBENCH_FDTD_2D.cpp        | 28 ++++++++++--------
 src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp |  5 ++--
 src/polybench/POLYBENCH_GEMM.cpp           |  8 +++---
 src/polybench/POLYBENCH_GEMVER.cpp         | 28 ++++++++++--------
 src/polybench/POLYBENCH_GESUMMV.cpp        |  7 +++--
 src/polybench/POLYBENCH_HEAT_3D.cpp        | 11 +++++---
 src/polybench/POLYBENCH_JACOBI_1D.cpp      | 11 +++++---
 src/polybench/POLYBENCH_JACOBI_2D.cpp      | 11 +++++---
 src/polybench/POLYBENCH_MVT.cpp            | 13 +++++----
 src/stream/ADD.cpp                         |  5 ++--
 src/stream/COPY.cpp                        |  5 ++--
 src/stream/DOT.cpp                         |  6 ++--
 src/stream/MUL.cpp                         |  5 ++--
 src/stream/TRIAD.cpp                       |  5 ++--
 82 files changed, 374 insertions(+), 281 deletions(-)

diff --git a/src/algorithm/ATOMIC.cpp b/src/algorithm/ATOMIC.cpp
index 1c05c9bb9..cab1e5b57 100644
--- a/src/algorithm/ATOMIC.cpp
+++ b/src/algorithm/ATOMIC.cpp
@@ -30,7 +30,8 @@ ATOMIC::ATOMIC(const RunParams& params)
   setKernelsPerRep(1);
   setBytesReadPerRep( 0 );
   setBytesWrittenPerRep( 0 );
-  setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesModifyWrittenPerRep( 0 );
+  setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // atomic (assumes replication == problem size)
   setFLOPsPerRep(getActualProblemSize());
 
   setComplexity(Complexity::N);
diff --git a/src/algorithm/HISTOGRAM.cpp b/src/algorithm/HISTOGRAM.cpp
index 974f415cc..e35467efc 100644
--- a/src/algorithm/HISTOGRAM.cpp
+++ b/src/algorithm/HISTOGRAM.cpp
@@ -34,10 +34,10 @@ HISTOGRAM::HISTOGRAM(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Data_type) * m_num_bins +
-                      1*sizeof(Index_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Data_type) * m_num_bins );
-  setBytesAtomicModifyWrittenPerRep( 0 );
+  setBytesReadPerRep( 1*sizeof(Index_type) * getActualProblemSize() ); // bins
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 0 );
+  setBytesAtomicModifyWrittenPerRep( 1*sizeof(Data_type) * m_num_bins ); // counts
   setFLOPsPerRep( (std::is_floating_point_v<Data_type> ? 1 : 0) * getActualProblemSize() );
 
   setComplexity(Complexity::N);
diff --git a/src/algorithm/MEMCPY.cpp b/src/algorithm/MEMCPY.cpp
index 088d7c815..f10450969 100644
--- a/src/algorithm/MEMCPY.cpp
+++ b/src/algorithm/MEMCPY.cpp
@@ -28,8 +28,9 @@ MEMCPY::MEMCPY(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // x
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // y
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/algorithm/MEMCPY.hpp b/src/algorithm/MEMCPY.hpp
index 02f814788..7a20c056b 100644
--- a/src/algorithm/MEMCPY.hpp
+++ b/src/algorithm/MEMCPY.hpp
@@ -10,7 +10,7 @@
 /// MEMCPY kernel reference implementation:
 ///
 /// for (Index_type i = ibegin; i < iend; ++i ) {
-///   x[i] = val ;
+///   y[i] = x[i] ;
 /// }
 ///
 
diff --git a/src/algorithm/MEMSET.cpp b/src/algorithm/MEMSET.cpp
index 2380d4d1b..d98f5d600 100644
--- a/src/algorithm/MEMSET.cpp
+++ b/src/algorithm/MEMSET.cpp
@@ -29,7 +29,8 @@ MEMSET::MEMSET(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
   setBytesReadPerRep( 0 );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // x
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/algorithm/REDUCE_SUM.cpp b/src/algorithm/REDUCE_SUM.cpp
index 5a86363a1..05a7708de 100644
--- a/src/algorithm/REDUCE_SUM.cpp
+++ b/src/algorithm/REDUCE_SUM.cpp
@@ -28,8 +28,9 @@ REDUCE_SUM::REDUCE_SUM(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) * (1+getActualProblemSize()) );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) );
+  setBytesReadPerRep( 1*sizeof(Real_type) * (1+getActualProblemSize()) ); // x
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(getActualProblemSize());
 
diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp
index ccb842f8b..8e66a8d82 100644
--- a/src/algorithm/SCAN.cpp
+++ b/src/algorithm/SCAN.cpp
@@ -28,8 +28,9 @@ SCAN::SCAN(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // x
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // y
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * getActualProblemSize());
 
diff --git a/src/algorithm/SORT.cpp b/src/algorithm/SORT.cpp
index 6b67c08d7..d8b2f3314 100644
--- a/src/algorithm/SORT.cpp
+++ b/src/algorithm/SORT.cpp
@@ -28,8 +28,10 @@ SORT::SORT(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // not useful in this case due to O(n*log(n)) algorithm
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // not useful in this case due to O(n*log(n)) algorithm
+  // not useful in this case due to O(n*log(n)) algorithm
+  setBytesReadPerRep( 0 );
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // x
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/algorithm/SORTPAIRS.cpp b/src/algorithm/SORTPAIRS.cpp
index 3747a629d..3189e5125 100644
--- a/src/algorithm/SORTPAIRS.cpp
+++ b/src/algorithm/SORTPAIRS.cpp
@@ -28,8 +28,10 @@ SORTPAIRS::SORTPAIRS(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // not useful in this case due to O(n*log(n)) algorithm
-  setBytesWrittenPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // not useful in this case due to O(n*log(n)) algorithm
+  // not useful in this case due to O(n*log(n)) algorithm
+  setBytesReadPerRep( 0 );
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // x, i
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/apps/CONVECTION3DPA.cpp b/src/apps/CONVECTION3DPA.cpp
index 173094b28..12d709ea6 100644
--- a/src/apps/CONVECTION3DPA.cpp
+++ b/src/apps/CONVECTION3DPA.cpp
@@ -36,9 +36,10 @@ CONVECTION3DPA::CONVECTION3DPA(const RunParams& params)
   setKernelsPerRep(1);
 
   setBytesReadPerRep( 3*sizeof(Real_type) * CPA_Q1D*CPA_D1D + // b, bt, g
-                      2*sizeof(Real_type) * CPA_D1D*CPA_D1D*CPA_D1D*m_NE + // x, y
+                      1*sizeof(Real_type) * CPA_D1D*CPA_D1D*CPA_D1D*m_NE + // x
                CPA_VDIM*sizeof(Real_type) * CPA_Q1D*CPA_Q1D*CPA_Q1D*m_NE ); // d
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * CPA_D1D*CPA_D1D*CPA_D1D*m_NE ); // y
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * CPA_D1D*CPA_D1D*CPA_D1D*m_NE ); // y
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   setFLOPsPerRep(m_NE * (
diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp
index e665690c9..4291230db 100644
--- a/src/apps/DEL_DOT_VEC_2D.cpp
+++ b/src/apps/DEL_DOT_VEC_2D.cpp
@@ -37,9 +37,10 @@ DEL_DOT_VEC_2D::DEL_DOT_VEC_2D(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() +
-                      4*sizeof(Real_type) * m_domain->n_real_nodes ); // 4 variables with 2d nodal stencil pattern: 4 touches per iterate
-  setBytesWrittenPerRep( 1*sizeof(Index_type) * getItsPerRep() );
+  setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() + // real_zones
+                      4*sizeof(Real_type) * m_domain->n_real_nodes ); // x, y, fx, fy (2d nodal stencil pattern: 4 touches per iterate)
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() ); // div
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(54 * m_domain->n_real_zones);
 
diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp
index 59d81dff9..c85d0e697 100644
--- a/src/apps/DIFFUSION3DPA.cpp
+++ b/src/apps/DIFFUSION3DPA.cpp
@@ -36,9 +36,10 @@ DIFFUSION3DPA::DIFFUSION3DPA(const RunParams& params)
   setKernelsPerRep(1);
 
   setBytesReadPerRep( 2*sizeof(Real_type) * DPA_Q1D*DPA_D1D + // b, g
-                      2*sizeof(Real_type) * DPA_D1D*DPA_D1D*DPA_D1D*m_NE + // x, y
+                      1*sizeof(Real_type) * DPA_D1D*DPA_D1D*DPA_D1D*m_NE + // x
                 DPA_SYM*sizeof(Real_type) * DPA_Q1D*DPA_Q1D*DPA_Q1D*m_NE ); // d
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * DPA_D1D*DPA_D1D*DPA_D1D*m_NE ); // y
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * DPA_D1D*DPA_D1D*DPA_D1D*m_NE ); // y
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   setFLOPsPerRep(m_NE * (DPA_Q1D * DPA_D1D +
diff --git a/src/apps/EDGE3D.cpp b/src/apps/EDGE3D.cpp
index 73fc13dd4..48e027571 100644
--- a/src/apps/EDGE3D.cpp
+++ b/src/apps/EDGE3D.cpp
@@ -40,8 +40,9 @@ EDGE3D::EDGE3D(const RunParams& params)
 
   // touched data size, not actual number of stores and loads
   // see VOL3D.cpp
-  setBytesReadPerRep( 3*sizeof(Real_type) * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() );
+  setBytesReadPerRep( 3*sizeof(Real_type) * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) ); // x, y, z (3d nodal stencil pattern: 8 touches per iterate)
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() ); // sum
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   constexpr size_t flops_k_loop = 15
diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp
index 3e2c90f94..b7312517a 100644
--- a/src/apps/ENERGY.cpp
+++ b/src/apps/ENERGY.cpp
@@ -29,21 +29,28 @@ ENERGY::ENERGY(const RunParams& params)
   setItsPerRep( 6 * getActualProblemSize() );
   setKernelsPerRep(6);
   // some branches are never taken due to the nature of the initialization of delvc
-  // the additional reads that would be done if those branches were taken are noted in the comments
-  setBytesReadPerRep((5*sizeof(Real_type) +
-                      1*sizeof(Real_type) + // 8
-                      6*sizeof(Real_type) +
-                      2*sizeof(Real_type) +
-                      7*sizeof(Real_type) + // 12
-                      1*sizeof(Real_type)   // 8
+  // the additional ops that would be done if those branches were taken are noted in the comments
+  setBytesReadPerRep((5*sizeof(Real_type) + // e_old, delvc, p_old, q_old, work
+                      1*sizeof(Real_type) + // delvc (+7 : compHalfStep, pbvc, e_new, bvc, pHalfStep, ql_old, qq_old)
+                      5*sizeof(Real_type) + // delvc, p_old, q_old, pHalfStep, q_new
+                      1*sizeof(Real_type) + // work
+                      6*sizeof(Real_type) + // delvc, p_old, q_old, pHalfStep, q_new, p_new (+5 : pbvc, vnewc, bvc, ql_old, qq_old )
+                      1*sizeof(Real_type)   // delvc (+7 : pbvc, e_new, vnewc, bvc, p_new, ql_old, qq_old )
                       ) * getActualProblemSize() );
-  setBytesWrittenPerRep((1*sizeof(Real_type) +
-                         1*sizeof(Real_type) +
-                         1*sizeof(Real_type) +
-                         1*sizeof(Real_type) +
-                         1*sizeof(Real_type) +
-                         0*sizeof(Real_type)
+  setBytesWrittenPerRep((1*sizeof(Real_type) + // e_new
+                         1*sizeof(Real_type) + // q_new
+                         0*sizeof(Real_type) +
+                         0*sizeof(Real_type) +
+                         0*sizeof(Real_type) +
+                         0*sizeof(Real_type)   // (+1 : q_new )
                          ) * getActualProblemSize() );
+  setBytesModifyWrittenPerRep( (0*sizeof(Real_type) +
+                                0*sizeof(Real_type) +
+                                1*sizeof(Real_type) + // e_new
+                                1*sizeof(Real_type) + // e_new
+                                1*sizeof(Real_type) + // e_new
+                                0*sizeof(Real_type)
+                                ) * getActualProblemSize() );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep((6  +
                   11 + // 1 sqrt
diff --git a/src/apps/FEMSWEEP.cpp b/src/apps/FEMSWEEP.cpp
index 3dcc5e6f7..988b3f99d 100644
--- a/src/apps/FEMSWEEP.cpp
+++ b/src/apps/FEMSWEEP.cpp
@@ -48,13 +48,13 @@ FEMSWEEP::FEMSWEEP(const RunParams& params)
   setItsPerRep(1);
   setKernelsPerRep(1);
   // using total data size instead of writes and reads
-  setBytesReadPerRep( 1*sizeof(Real_type) * m_Blen +
-                      1*sizeof(Real_type) * m_Alen +
-                      1*sizeof(Real_type) * m_Flen +
-                      1*sizeof(Real_type) * m_Sglen +
-                      1*sizeof(Real_type) * m_M0len +
-                      1*sizeof(Real_type) * m_Xlen );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_Xlen );
+  setBytesReadPerRep( 1*sizeof(Real_type) * m_Blen + // Bdat
+                      1*sizeof(Real_type) * m_Alen + // Adat
+                      1*sizeof(Real_type) * m_Flen + // Fdat
+                      1*sizeof(Real_type) * m_Sglen + // Sgdat
+                      1*sizeof(Real_type) * m_M0len ); // M0dat
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * m_Xlen ); // Xdat
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   // This is an estimate of the upper bound FLOPs.
diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp
index a46be0453..ef7c7498b 100644
--- a/src/apps/FIR.cpp
+++ b/src/apps/FIR.cpp
@@ -30,9 +30,10 @@ FIR::FIR(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( m_coefflen*sizeof(Real_type) +
-                      1*sizeof(Real_type) * (getActualProblemSize() + m_coefflen-1) );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( m_coefflen*sizeof(Real_type) + // coeff
+                      1*sizeof(Real_type) * (getActualProblemSize() + m_coefflen-1) ); // in
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // out
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep((2 * m_coefflen) * getActualProblemSize());
 
diff --git a/src/apps/INTSC_HEXHEX.cpp b/src/apps/INTSC_HEXHEX.cpp
index 5520caaca..20c99a359 100644
--- a/src/apps/INTSC_HEXHEX.cpp
+++ b/src/apps/INTSC_HEXHEX.cpp
@@ -65,6 +65,7 @@ INTSC_HEXHEX::INTSC_HEXHEX(const RunParams& params)
   //   A standard intersection is 8 subzone intersections.
   //
   setBytesWrittenPerRep( 13*8*sizeof(Real_type) * getItsPerRep() );
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   constexpr Size_type flops_per_tri = 336 ;
diff --git a/src/apps/INTSC_HEXRECT.cpp b/src/apps/INTSC_HEXRECT.cpp
index 200676617..221ef15c5 100644
--- a/src/apps/INTSC_HEXRECT.cpp
+++ b/src/apps/INTSC_HEXRECT.cpp
@@ -73,6 +73,7 @@ INTSC_HEXRECT::INTSC_HEXRECT(const RunParams& params)
 
   // Bytes written : nvals_hexrect (=4) doubles for each intersection.
   setBytesWrittenPerRep( nvals_hexrect*sizeof(Real_type) * getItsPerRep() );
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   constexpr Size_type flops_per_tri = 150 ;
diff --git a/src/apps/LTIMES.cpp b/src/apps/LTIMES.cpp
index cf7266688..f48fe5fc5 100644
--- a/src/apps/LTIMES.cpp
+++ b/src/apps/LTIMES.cpp
@@ -42,10 +42,10 @@ LTIMES::LTIMES(const RunParams& params)
   setItsPerRep( m_philen );
   setKernelsPerRep(1);
   // using total data size instead of writes and reads
-  setBytesReadPerRep( 1*sizeof(Real_type) * m_philen +
-                      1*sizeof(Real_type) * m_elllen +
-                      1*sizeof(Real_type) * m_psilen );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_philen );
+  setBytesReadPerRep( 1*sizeof(Real_type) * m_elllen + // ell
+                      1*sizeof(Real_type) * m_psilen ); // psi
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * m_philen ); // phi
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * m_num_z * m_num_g * m_num_m * m_num_d);
 
diff --git a/src/apps/LTIMES_NOVIEW.cpp b/src/apps/LTIMES_NOVIEW.cpp
index a06c930a5..8d49d60e5 100644
--- a/src/apps/LTIMES_NOVIEW.cpp
+++ b/src/apps/LTIMES_NOVIEW.cpp
@@ -42,10 +42,10 @@ LTIMES_NOVIEW::LTIMES_NOVIEW(const RunParams& params)
   setItsPerRep( m_philen );
   setKernelsPerRep(1);
   // using total data size instead of writes and reads
-  setBytesReadPerRep( 1*sizeof(Real_type) * m_philen +
-                      1*sizeof(Real_type) * m_elllen +
-                      1*sizeof(Real_type) * m_psilen );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_philen );
+  setBytesReadPerRep( 1*sizeof(Real_type) * m_elllen + // ell
+                      1*sizeof(Real_type) * m_psilen ); // psi
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * m_philen ); // phi
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * m_num_z * m_num_g * m_num_m * m_num_d);
 
diff --git a/src/apps/MASS3DEA.cpp b/src/apps/MASS3DEA.cpp
index 2b2cc3e73..b769b8618 100644
--- a/src/apps/MASS3DEA.cpp
+++ b/src/apps/MASS3DEA.cpp
@@ -40,6 +40,7 @@ MASS3DEA::MASS3DEA(const RunParams& params)
   setBytesReadPerRep( 1*sizeof(Real_type) * MEA_Q1D*MEA_D1D + // B
                       1*sizeof(Real_type) * MEA_Q1D*MEA_Q1D*MEA_Q1D*m_NE ); // D
   setBytesWrittenPerRep( 1*sizeof(Real_type) * ea_mat_entries*m_NE ); // M_e
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   setFLOPsPerRep(m_NE * 7 * ea_mat_entries);
diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp
index 7d75e6bf6..35927cde0 100644
--- a/src/apps/MASS3DPA.cpp
+++ b/src/apps/MASS3DPA.cpp
@@ -36,9 +36,10 @@ MASS3DPA::MASS3DPA(const RunParams& params)
   setKernelsPerRep(1);
 
   setBytesReadPerRep( 2*sizeof(Real_type) * MPA_Q1D*MPA_D1D + // B, Bt
-                      2*sizeof(Real_type) * MPA_D1D*MPA_D1D*MPA_D1D*m_NE + // X, Y
+                      1*sizeof(Real_type) * MPA_D1D*MPA_D1D*MPA_D1D*m_NE + // X
                       1*sizeof(Real_type) * MPA_Q1D*MPA_Q1D*MPA_Q1D*m_NE ); // D
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * MPA_D1D*MPA_D1D*MPA_D1D*m_NE ); // Y
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * MPA_D1D*MPA_D1D*MPA_D1D*m_NE ); // Y
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   setFLOPsPerRep(m_NE * (2 * MPA_D1D * MPA_D1D * MPA_D1D * MPA_Q1D +
diff --git a/src/apps/MASSVEC3DPA.cpp b/src/apps/MASSVEC3DPA.cpp
index f5191cb53..13d47113d 100644
--- a/src/apps/MASSVEC3DPA.cpp
+++ b/src/apps/MASSVEC3DPA.cpp
@@ -43,6 +43,7 @@ MASSVEC3DPA::MASSVEC3DPA(const RunParams &params)
                          m_NE); // D
   setBytesWrittenPerRep(3 * sizeof(Real_type) * MVPA_D1D * MVPA_D1D * MVPA_D1D *
                         MVPA_DIM * m_NE); // Y (3 components)
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep(0);
 
   //3 for the dimension loop
diff --git a/src/apps/MATVEC_3D_STENCIL.cpp b/src/apps/MATVEC_3D_STENCIL.cpp
index d06d7fe39..db2876be3 100644
--- a/src/apps/MATVEC_3D_STENCIL.cpp
+++ b/src/apps/MATVEC_3D_STENCIL.cpp
@@ -69,10 +69,11 @@ MATVEC_3D_STENCIL::MATVEC_3D_STENCIL(const RunParams& params)
                             get_size_matrix(1, 1, 1) +
                             get_size_matrix(0, 1, 1) +
                             get_size_matrix(1, 1, 1) ;
-  setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() +
-                      1*sizeof(Real_type) * x_accessed +
-                      1*sizeof(Real_type) * m_accessed );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * b_accessed );
+  setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() + // real_zones
+                      1*sizeof(Real_type) * x_accessed + // x
+                      1*sizeof(Real_type) * m_accessed ); // m
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * b_accessed ); // b
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   const size_t multiplies = 27;
diff --git a/src/apps/NODAL_ACCUMULATION_3D.cpp b/src/apps/NODAL_ACCUMULATION_3D.cpp
index 9cdf5afcb..ef3a74922 100644
--- a/src/apps/NODAL_ACCUMULATION_3D.cpp
+++ b/src/apps/NODAL_ACCUMULATION_3D.cpp
@@ -39,10 +39,11 @@ NODAL_ACCUMULATION_3D::NODAL_ACCUMULATION_3D(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
   // touched data size, not actual number of stores and loads
-  setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() +
-                      1*sizeof(Real_type) * getItsPerRep() );
+  setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() + // real_zones
+                      1*sizeof(Real_type) * getItsPerRep() ); // vol
   setBytesWrittenPerRep( 0 );
-  setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) * m_domain->n_real_nodes );
+  setBytesModifyWrittenPerRep( 0 );
+  setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) * m_domain->n_real_nodes ); // x (3d nodal stencil pattern: 8 touches per iterate)
   setFLOPsPerRep(9 * getItsPerRep());
 
   checksum_scale_factor = 0.001 *
diff --git a/src/apps/PRESSURE.cpp b/src/apps/PRESSURE.cpp
index e1df584e0..5c32afa7d 100644
--- a/src/apps/PRESSURE.cpp
+++ b/src/apps/PRESSURE.cpp
@@ -28,10 +28,11 @@ PRESSURE::PRESSURE(const RunParams& params)
 
   setItsPerRep( 2 * getActualProblemSize() );
   setKernelsPerRep(2);
-  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() +
-                      3*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() +
-                         1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() + // compression
+                      3*sizeof(Real_type) * getActualProblemSize() ); // bvc, e_old, vnewc
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() + // bvc
+                         1*sizeof(Real_type) * getActualProblemSize() ); // p_new
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep((2 +
                   1
diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp
index 04ab81bfb..d31a44766 100644
--- a/src/apps/VOL3D.cpp
+++ b/src/apps/VOL3D.cpp
@@ -38,8 +38,9 @@ VOL3D::VOL3D(const RunParams& params)
   setItsPerRep( m_domain->lpz+1 - m_domain->fpz );
   setKernelsPerRep(1);
   // touched data size, not actual number of stores and loads
-  setBytesReadPerRep( 3*sizeof(Real_type) * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() );
+  setBytesReadPerRep( 3*sizeof(Real_type) * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) ); // x, y, z (3d nodal stencil pattern: 8 touches per iterate)
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() ); // vol
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(72 * (m_domain->lpz+1 - m_domain->fpz));
 
diff --git a/src/apps/ZONAL_ACCUMULATION_3D.cpp b/src/apps/ZONAL_ACCUMULATION_3D.cpp
index ac90c91b9..a10a036a4 100644
--- a/src/apps/ZONAL_ACCUMULATION_3D.cpp
+++ b/src/apps/ZONAL_ACCUMULATION_3D.cpp
@@ -39,9 +39,10 @@ ZONAL_ACCUMULATION_3D::ZONAL_ACCUMULATION_3D(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
   // touched data size, not actual number of stores and loads
-  setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() +
-                      1*sizeof(Real_type) * m_domain->n_real_nodes );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() );
+  setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() + // real_zones
+                      1*sizeof(Real_type) * m_domain->n_real_nodes ); // x (3d nodal stencil pattern: 8 touches per iterate)
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() ); // vol
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(8 * getItsPerRep());
 
diff --git a/src/basic/ARRAY_OF_PTRS.cpp b/src/basic/ARRAY_OF_PTRS.cpp
index ad13dd38e..679c313b8 100644
--- a/src/basic/ARRAY_OF_PTRS.cpp
+++ b/src/basic/ARRAY_OF_PTRS.cpp
@@ -30,8 +30,9 @@ ARRAY_OF_PTRS::ARRAY_OF_PTRS(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( m_array_size*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( m_array_size*sizeof(Real_type) * getActualProblemSize() ); // x
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // y
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(m_array_size * getActualProblemSize());
 
diff --git a/src/basic/COPY8.cpp b/src/basic/COPY8.cpp
index 5aa98f4ef..3d7ce5bff 100644
--- a/src/basic/COPY8.cpp
+++ b/src/basic/COPY8.cpp
@@ -28,8 +28,9 @@ COPY8::COPY8(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 8*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 8*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 8*sizeof(Real_type) * getActualProblemSize() ); // x0, x1, x2, x3, x4, x5, x6, x7
+  setBytesWrittenPerRep( 8*sizeof(Real_type) * getActualProblemSize() ); // y0, y1, y2, y3, y4, y5, y6, y7
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp
index d4aef4aed..ebb387b48 100644
--- a/src/basic/DAXPY.cpp
+++ b/src/basic/DAXPY.cpp
@@ -28,8 +28,9 @@ DAXPY::DAXPY(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // x
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // y
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * getActualProblemSize());
 
diff --git a/src/basic/DAXPY_ATOMIC.cpp b/src/basic/DAXPY_ATOMIC.cpp
index d1c914c62..7bfd87980 100644
--- a/src/basic/DAXPY_ATOMIC.cpp
+++ b/src/basic/DAXPY_ATOMIC.cpp
@@ -28,9 +28,10 @@ DAXPY_ATOMIC::DAXPY_ATOMIC(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // x
   setBytesWrittenPerRep( 0 );
-  setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesModifyWrittenPerRep( 0 );
+  setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // y
   setFLOPsPerRep(2 * getActualProblemSize());
 
   setComplexity(Complexity::N);
diff --git a/src/basic/EMPTY.cpp b/src/basic/EMPTY.cpp
index 6d507c6fe..13559b4ad 100644
--- a/src/basic/EMPTY.cpp
+++ b/src/basic/EMPTY.cpp
@@ -30,6 +30,7 @@ EMPTY::EMPTY(const RunParams& params)
   setKernelsPerRep( 1 );
   setBytesReadPerRep( 0 );
   setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep( 0 );
 
diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp
index a6a9ae34d..81f314767 100644
--- a/src/basic/IF_QUAD.cpp
+++ b/src/basic/IF_QUAD.cpp
@@ -28,8 +28,9 @@ IF_QUAD::IF_QUAD(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 3*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 2*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 3*sizeof(Real_type) * getActualProblemSize() ); // b, a, c
+  setBytesWrittenPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // x2, x1
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   // estimate conditional true half of the time, 1 sqrt
   setFLOPsPerRep(4 * getActualProblemSize() +
diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp
index fc9513ddb..a9ea6704d 100644
--- a/src/basic/INDEXLIST.cpp
+++ b/src/basic/INDEXLIST.cpp
@@ -28,10 +28,9 @@ INDEXLIST::INDEXLIST(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Index_type) +
-                      1*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Index_type) +
-                         1*sizeof(Int_type) * getActualProblemSize() / 2 ); // about 50% output
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // x
+  setBytesWrittenPerRep( 1*sizeof(Int_type) * getActualProblemSize() / 2 ); // list (about 50% output)
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/basic/INDEXLIST_3LOOP.cpp b/src/basic/INDEXLIST_3LOOP.cpp
index 38d691a76..b46573f09 100644
--- a/src/basic/INDEXLIST_3LOOP.cpp
+++ b/src/basic/INDEXLIST_3LOOP.cpp
@@ -28,18 +28,15 @@ INDEXLIST_3LOOP::INDEXLIST_3LOOP(const RunParams& params)
 
   setItsPerRep( 3 * getActualProblemSize() + 1 );
   setKernelsPerRep(3);
-  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() +
-
-                      1*sizeof(Index_type) +
-                      1*sizeof(Index_type) * (getActualProblemSize()+1) +
-
-                      1*sizeof(Index_type) * (getActualProblemSize()+1) );
-  setBytesWrittenPerRep( 1*sizeof(Index_type) * getActualProblemSize() +
-
-                         1*sizeof(Index_type) +
-                         1*sizeof(Index_type) * (getActualProblemSize()+1) +
-
-                         1*sizeof(Int_type) * (getActualProblemSize()+1) / 2 ); // about 50% output
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() + // x
+                      0 +
+                      1*sizeof(Index_type) * (getActualProblemSize()+1) ); // counts
+  setBytesWrittenPerRep( 1*sizeof(Index_type) * getActualProblemSize() + // counts
+                         0 +
+                         1*sizeof(Int_type) * (getActualProblemSize()+1) / 2 ); // list (about 50% output)
+  setBytesModifyWrittenPerRep( 0 +
+                               1*sizeof(Index_type) * (getActualProblemSize()+1) + // counts
+                               0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp
index 89b9ac2e9..17988f502 100644
--- a/src/basic/INIT3.cpp
+++ b/src/basic/INIT3.cpp
@@ -28,8 +28,9 @@ INIT3::INIT3(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 3*sizeof(Real_type) * getActualProblemSize()  );
+  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // in1, in2
+  setBytesWrittenPerRep( 3*sizeof(Real_type) * getActualProblemSize()  ); // out3, out2, out1
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * getActualProblemSize());
 
diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp
index ec9c86507..3d3cc80a2 100644
--- a/src/basic/INIT_VIEW1D.cpp
+++ b/src/basic/INIT_VIEW1D.cpp
@@ -29,7 +29,8 @@ INIT_VIEW1D::INIT_VIEW1D(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
   setBytesReadPerRep( 0 );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize()  );
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // a
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * getActualProblemSize());
 
diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp
index ba029398c..568468906 100644
--- a/src/basic/INIT_VIEW1D_OFFSET.cpp
+++ b/src/basic/INIT_VIEW1D_OFFSET.cpp
@@ -29,7 +29,8 @@ INIT_VIEW1D_OFFSET::INIT_VIEW1D_OFFSET(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
   setBytesReadPerRep( 0 );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize()  );
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // a
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * getActualProblemSize());
 
diff --git a/src/basic/MAT_MAT_SHARED.cpp b/src/basic/MAT_MAT_SHARED.cpp
index 2d2211d8f..9feaaa094 100644
--- a/src/basic/MAT_MAT_SHARED.cpp
+++ b/src/basic/MAT_MAT_SHARED.cpp
@@ -32,8 +32,9 @@ MAT_MAT_SHARED::MAT_MAT_SHARED(const RunParams &params)
   setItsPerRep( num_tiles*num_tiles * TL_SZ*TL_SZ );
   setKernelsPerRep(1);
 
-  setBytesReadPerRep( 2*sizeof(Real_type) * m_N*m_N );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_N*m_N  );
+  setBytesReadPerRep( 2*sizeof(Real_type) * m_N*m_N ); // A, B
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_N*m_N  ); // C
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   setFLOPsPerRep(2 * TL_SZ * TL_SZ * TL_SZ * num_tiles * num_tiles * num_tiles);
diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp
index 139efcb15..05f1bef52 100644
--- a/src/basic/MULADDSUB.cpp
+++ b/src/basic/MULADDSUB.cpp
@@ -28,8 +28,9 @@ MULADDSUB::MULADDSUB(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 3*sizeof(Real_type) * getActualProblemSize()  );
+  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // in1, in2
+  setBytesWrittenPerRep( 3*sizeof(Real_type) * getActualProblemSize()  ); // out1, out2, out3
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(3 * getActualProblemSize());
 
diff --git a/src/basic/MULTI_REDUCE.cpp b/src/basic/MULTI_REDUCE.cpp
index 391030d32..c0ceb233b 100644
--- a/src/basic/MULTI_REDUCE.cpp
+++ b/src/basic/MULTI_REDUCE.cpp
@@ -34,11 +34,11 @@ MULTI_REDUCE::MULTI_REDUCE(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Data_type) * m_num_bins +
-                      1*sizeof(Data_type) * getActualProblemSize() +
-                      1*sizeof(Index_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Data_type) * m_num_bins );
-  setBytesAtomicModifyWrittenPerRep( 0 );
+  setBytesReadPerRep( 1*sizeof(Data_type) * getActualProblemSize() + // data
+                      1*sizeof(Index_type) * getActualProblemSize() ); // bins
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 0 );
+  setBytesAtomicModifyWrittenPerRep( 1*sizeof(Data_type) * m_num_bins ); // values
   setFLOPsPerRep(1 * getActualProblemSize());
 
   setComplexity(Complexity::N);
diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp
index 8251b5be7..d3ef58491 100644
--- a/src/basic/NESTED_INIT.cpp
+++ b/src/basic/NESTED_INIT.cpp
@@ -40,7 +40,8 @@ NESTED_INIT::NESTED_INIT(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
   setBytesReadPerRep( 0 );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize()  );
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // array
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(3 * getActualProblemSize());
 
diff --git a/src/basic/PI_ATOMIC.cpp b/src/basic/PI_ATOMIC.cpp
index a9c57117d..a227a5fd6 100644
--- a/src/basic/PI_ATOMIC.cpp
+++ b/src/basic/PI_ATOMIC.cpp
@@ -30,7 +30,8 @@ PI_ATOMIC::PI_ATOMIC(const RunParams& params)
   setKernelsPerRep(1);
   setBytesReadPerRep( 0 );
   setBytesWrittenPerRep( 0  );
-  setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) );
+  setBytesModifyWrittenPerRep( 0 );
+  setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) ); // pi
   setFLOPsPerRep(6 * getActualProblemSize() + 1);
 
   setComplexity(Complexity::N);
diff --git a/src/basic/PI_REDUCE.cpp b/src/basic/PI_REDUCE.cpp
index 9e9e2866f..de00c4223 100644
--- a/src/basic/PI_REDUCE.cpp
+++ b/src/basic/PI_REDUCE.cpp
@@ -28,8 +28,9 @@ PI_REDUCE::PI_REDUCE(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) );
+  setBytesReadPerRep( 0 );
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(6 * getActualProblemSize() + 1);
 
diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp
index fef0af251..b1ff14a04 100644
--- a/src/basic/REDUCE3_INT.cpp
+++ b/src/basic/REDUCE3_INT.cpp
@@ -33,9 +33,9 @@ REDUCE3_INT::REDUCE3_INT(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 3*sizeof(Int_type) +
-                      1*sizeof(Int_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 3*sizeof(Int_type) );
+  setBytesReadPerRep( 1*sizeof(Int_type) * getActualProblemSize() ); // vec
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp
index 180f29f20..83cd93ed4 100644
--- a/src/basic/REDUCE_STRUCT.cpp
+++ b/src/basic/REDUCE_STRUCT.cpp
@@ -33,9 +33,9 @@ REDUCE_STRUCT::REDUCE_STRUCT(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 6*sizeof(Real_type) +
-                      2*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 6*sizeof(Real_type) );
+  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // x, y
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * getActualProblemSize() + 2);
 
diff --git a/src/basic/TRAP_INT.cpp b/src/basic/TRAP_INT.cpp
index dd0347fdb..857f87253 100644
--- a/src/basic/TRAP_INT.cpp
+++ b/src/basic/TRAP_INT.cpp
@@ -28,8 +28,9 @@ TRAP_INT::TRAP_INT(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) );
+  setBytesReadPerRep( 0 );
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(10 * getActualProblemSize()); // 1 sqrt
 
diff --git a/src/comm/HALO_EXCHANGE.cpp b/src/comm/HALO_EXCHANGE.cpp
index 2f2910e46..dd91a340c 100644
--- a/src/comm/HALO_EXCHANGE.cpp
+++ b/src/comm/HALO_EXCHANGE.cpp
@@ -32,18 +32,19 @@ HALO_EXCHANGE::HALO_EXCHANGE(const RunParams& params)
 
   setItsPerRep( 2 * m_num_vars * halo_size );
   setKernelsPerRep( 2 * s_num_neighbors * m_num_vars );
-  setBytesReadPerRep( 1*sizeof(Int_type) * m_num_vars * halo_size +   // pack
-                      1*sizeof(Real_type) * m_num_vars * halo_size +  // pack
+  setBytesReadPerRep( 1*sizeof(Int_type) * m_num_vars * halo_size +   // pack_index_lists
+                      1*sizeof(Real_type) * m_num_vars * halo_size +  // vars
 
-                      1*sizeof(Real_type) * m_num_vars * halo_size +  // send
+                      1*sizeof(Real_type) * m_num_vars * halo_size +  // (pack|send)_buffers (MPI)
 
-                      1*sizeof(Int_type) * m_num_vars * halo_size +   // unpack
-                      1*sizeof(Real_type) * m_num_vars * halo_size ); // unpack
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_num_vars * halo_size +  // pack
+                      1*sizeof(Int_type) * m_num_vars * halo_size +   // unpack_index_lists
+                      1*sizeof(Real_type) * m_num_vars * halo_size ); // unpack_buffers
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_num_vars * halo_size +  // pack_buffers
 
-                         1*sizeof(Real_type) * m_num_vars * halo_size +  // recv
+                         1*sizeof(Real_type) * m_num_vars * halo_size +  // (recv|unpack)_buffers (MPI)
 
-                         1*sizeof(Real_type) * m_num_vars * halo_size ); // unpack
+                         1*sizeof(Real_type) * m_num_vars * halo_size ); // vars
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/comm/HALO_EXCHANGE_FUSED.cpp b/src/comm/HALO_EXCHANGE_FUSED.cpp
index c121deac8..b63b0bb59 100644
--- a/src/comm/HALO_EXCHANGE_FUSED.cpp
+++ b/src/comm/HALO_EXCHANGE_FUSED.cpp
@@ -32,18 +32,19 @@ HALO_EXCHANGE_FUSED::HALO_EXCHANGE_FUSED(const RunParams& params)
 
   setItsPerRep( 2 * m_num_vars * halo_size );
   setKernelsPerRep( 2 );
-  setBytesReadPerRep( 1*sizeof(Int_type) * m_num_vars * halo_size +   // pack
-                      1*sizeof(Real_type) * m_num_vars * halo_size +  // pack
+  setBytesReadPerRep( 1*sizeof(Int_type) * m_num_vars * halo_size +   // pack_index_lists
+                      1*sizeof(Real_type) * m_num_vars * halo_size +  // vars
 
-                      1*sizeof(Real_type) * m_num_vars * halo_size +  // send
+                      1*sizeof(Real_type) * m_num_vars * halo_size +  // (pack|send)_buffers (MPI)
 
-                      1*sizeof(Int_type) * m_num_vars * halo_size +   // unpack
-                      1*sizeof(Real_type) * m_num_vars * halo_size ); // unpack
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_num_vars * halo_size +  // pack
+                      1*sizeof(Int_type) * m_num_vars * halo_size +   // unpack_index_lists
+                      1*sizeof(Real_type) * m_num_vars * halo_size ); // unpack_buffers
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_num_vars * halo_size +  // pack_buffers
 
-                         1*sizeof(Real_type) * m_num_vars * halo_size +  // recv
+                         1*sizeof(Real_type) * m_num_vars * halo_size +  // (recv|unpack)_buffers (MPI)
 
-                         1*sizeof(Real_type) * m_num_vars * halo_size ); // unpack
+                         1*sizeof(Real_type) * m_num_vars * halo_size ); // vars
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/comm/HALO_PACKING.cpp b/src/comm/HALO_PACKING.cpp
index a0406e693..8ee54c176 100644
--- a/src/comm/HALO_PACKING.cpp
+++ b/src/comm/HALO_PACKING.cpp
@@ -26,14 +26,15 @@ HALO_PACKING::HALO_PACKING(const RunParams& params)
 
   setItsPerRep( 2 * m_num_vars * halo_size );
   setKernelsPerRep( 2 * s_num_neighbors * m_num_vars );
-  setBytesReadPerRep( 1*sizeof(Int_type) * m_num_vars * halo_size +   // pack
-                      1*sizeof(Real_type) * m_num_vars * halo_size +  // pack
+  setBytesReadPerRep( 1*sizeof(Int_type) * m_num_vars * halo_size +   // pack_index_lists
+                      1*sizeof(Real_type) * m_num_vars * halo_size +  // vars
 
-                      1*sizeof(Int_type) * m_num_vars * halo_size +   // unpack
-                      1*sizeof(Real_type) * m_num_vars * halo_size ); // unpack
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_num_vars * halo_size +  // pack
+                      1*sizeof(Int_type) * m_num_vars * halo_size +   // unpack_index_lists
+                      1*sizeof(Real_type) * m_num_vars * halo_size ); // unpack_buffers
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_num_vars * halo_size +  // pack_buffers
 
-                         1*sizeof(Real_type) * m_num_vars * halo_size ); // unpack
+                         1*sizeof(Real_type) * m_num_vars * halo_size ); // vars
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/comm/HALO_PACKING_FUSED.cpp b/src/comm/HALO_PACKING_FUSED.cpp
index 8ab60d787..16e20cd11 100644
--- a/src/comm/HALO_PACKING_FUSED.cpp
+++ b/src/comm/HALO_PACKING_FUSED.cpp
@@ -26,14 +26,15 @@ HALO_PACKING_FUSED::HALO_PACKING_FUSED(const RunParams& params)
 
   setItsPerRep( 2 * m_num_vars * halo_size );
   setKernelsPerRep( 2 );
-  setBytesReadPerRep( 1*sizeof(Int_type) * m_num_vars * halo_size +   // pack
-                      1*sizeof(Real_type) * m_num_vars * halo_size +  // pack
+  setBytesReadPerRep( 1*sizeof(Int_type) * m_num_vars * halo_size +   // pack_index_lists
+                      1*sizeof(Real_type) * m_num_vars * halo_size +  // vars
 
-                      1*sizeof(Int_type) * m_num_vars * halo_size +   // unpack
-                      1*sizeof(Real_type) * m_num_vars * halo_size ); // unpack
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_num_vars * halo_size +  // pack
+                      1*sizeof(Int_type) * m_num_vars * halo_size +   // unpack_index_lists
+                      1*sizeof(Real_type) * m_num_vars * halo_size ); // unpack_buffers
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_num_vars * halo_size +  // pack_buffers
 
-                         1*sizeof(Real_type) * m_num_vars * halo_size ); // unpack
+                         1*sizeof(Real_type) * m_num_vars * halo_size ); // vars
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/comm/HALO_SENDRECV.cpp b/src/comm/HALO_SENDRECV.cpp
index ca2975b06..9d1fd95fa 100644
--- a/src/comm/HALO_SENDRECV.cpp
+++ b/src/comm/HALO_SENDRECV.cpp
@@ -32,8 +32,9 @@ HALO_SENDRECV::HALO_SENDRECV(const RunParams& params)
 
   setItsPerRep( 0 );
   setKernelsPerRep( 0 );
-  setBytesReadPerRep( 1*sizeof(Real_type) * m_num_vars * halo_size ); // send
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_num_vars * halo_size ); // recv
+  setBytesReadPerRep( 1*sizeof(Real_type) * m_num_vars * halo_size ); // send_buffers (MPI)
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * m_num_vars * halo_size ); // recv_buffers (MPI)
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/lcals/DIFF_PREDICT.cpp b/src/lcals/DIFF_PREDICT.cpp
index f8b681430..68a21ea69 100644
--- a/src/lcals/DIFF_PREDICT.cpp
+++ b/src/lcals/DIFF_PREDICT.cpp
@@ -28,8 +28,9 @@ DIFF_PREDICT::DIFF_PREDICT(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
 
   setKernelsPerRep(1);
-  setBytesReadPerRep( 10*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 10*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // cx(4)
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // px(13)
+  setBytesModifyWrittenPerRep( 9*sizeof(Real_type) * getActualProblemSize() ); // px(4), px(5), px(6), px(7), px(8), px(9), px(10), px(11), px(12)
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(9 * getActualProblemSize());
 
diff --git a/src/lcals/EOS.cpp b/src/lcals/EOS.cpp
index 8394e00a5..8b92dea36 100644
--- a/src/lcals/EOS.cpp
+++ b/src/lcals/EOS.cpp
@@ -31,9 +31,10 @@ EOS::EOS(const RunParams& params)
   setItsPerRep( getActualProblemSize() );
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() +
-                      1*sizeof(Real_type) * m_array_length );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() + // z, y
+                      1*sizeof(Real_type) * m_array_length ); // u (each iterate accesses the range [i, i+6])
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // x
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(16 * getActualProblemSize());
 
diff --git a/src/lcals/FIRST_DIFF.cpp b/src/lcals/FIRST_DIFF.cpp
index d9da1de3e..46047c3fd 100644
--- a/src/lcals/FIRST_DIFF.cpp
+++ b/src/lcals/FIRST_DIFF.cpp
@@ -30,8 +30,9 @@ FIRST_DIFF::FIRST_DIFF(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) * m_N );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * m_N ); // y (each iterate accesses the range [i, i+1])
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // x
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * getActualProblemSize());
 
diff --git a/src/lcals/FIRST_MIN.cpp b/src/lcals/FIRST_MIN.cpp
index 973d50598..f8c244cb5 100644
--- a/src/lcals/FIRST_MIN.cpp
+++ b/src/lcals/FIRST_MIN.cpp
@@ -33,11 +33,9 @@ FIRST_MIN::FIRST_MIN(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Index_type) +
-                      1*sizeof(Real_type ) +
-                      1*sizeof(Real_type ) * m_N );
-  setBytesWrittenPerRep( 1*sizeof(Index_type) +
-                         1*sizeof(Real_type ) );
+  setBytesReadPerRep( 1*sizeof(Real_type ) * m_N ); // x
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/lcals/FIRST_SUM.cpp b/src/lcals/FIRST_SUM.cpp
index 948ed213d..17df7ed47 100644
--- a/src/lcals/FIRST_SUM.cpp
+++ b/src/lcals/FIRST_SUM.cpp
@@ -30,8 +30,9 @@ FIRST_SUM::FIRST_SUM(const RunParams& params)
 
   setItsPerRep( m_N-1 );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type ) * m_N );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * (m_N-1) );
+  setBytesReadPerRep( 1*sizeof(Real_type ) * m_N ); // y (each iterate accesses the range [i-1, i])
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * (m_N-1) ); // x
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * (m_N-1));
 
diff --git a/src/lcals/GEN_LIN_RECUR.cpp b/src/lcals/GEN_LIN_RECUR.cpp
index 7ddfad28a..0f2512d6d 100644
--- a/src/lcals/GEN_LIN_RECUR.cpp
+++ b/src/lcals/GEN_LIN_RECUR.cpp
@@ -30,10 +30,12 @@ GEN_LIN_RECUR::GEN_LIN_RECUR(const RunParams& params)
 
   setItsPerRep( 2 * m_N );
   setKernelsPerRep(2);
-  setBytesReadPerRep( 3*sizeof(Real_type ) * m_N +
-                      3*sizeof(Real_type ) * m_N );
-  setBytesWrittenPerRep( 2*sizeof(Real_type ) * m_N +
-                         2*sizeof(Real_type ) * m_N );
+  setBytesReadPerRep( 2*sizeof(Real_type ) * m_N + // sa, sb
+                      2*sizeof(Real_type ) * m_N ); // sa, sb
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_N + // b5
+                         1*sizeof(Real_type ) * m_N ); // b5
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type ) * m_N + // stb5
+                               1*sizeof(Real_type ) * m_N ); // stb5
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep((3 +
                   3 ) * m_N);
diff --git a/src/lcals/HYDRO_1D.cpp b/src/lcals/HYDRO_1D.cpp
index 9be3fcd60..82e0fec46 100644
--- a/src/lcals/HYDRO_1D.cpp
+++ b/src/lcals/HYDRO_1D.cpp
@@ -30,9 +30,10 @@ HYDRO_1D::HYDRO_1D(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type ) * getActualProblemSize() +
-                      1*sizeof(Real_type ) * (getActualProblemSize()+1) );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type ) * getActualProblemSize() + // y
+                      1*sizeof(Real_type ) * (getActualProblemSize()+1) ); // z (each iterate accesses the range [i+10, i+11])
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() ); // x
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(5 * getActualProblemSize());
 
diff --git a/src/lcals/HYDRO_2D.cpp b/src/lcals/HYDRO_2D.cpp
index 7a1881737..f8ffa2dd4 100644
--- a/src/lcals/HYDRO_2D.cpp
+++ b/src/lcals/HYDRO_2D.cpp
@@ -40,18 +40,23 @@ HYDRO_2D::HYDRO_2D(const RunParams& params)
 
   setItsPerRep( 3 * (m_kn-2) * (m_jn-2) );
   setKernelsPerRep(3);
-  setBytesReadPerRep( 4*sizeof(Real_type ) * ((m_kn-1) * (m_jn-1) - 1) +
+  setBytesReadPerRep( 2*sizeof(Real_type ) * ((m_kn-1) * (m_jn-1) - 1) + // zp, zq (4 point stencil)
+                      2*sizeof(Real_type ) * ((m_kn-1) * (m_jn-1) - 1) + // zr, zm (3 point stencil)
 
-                      2*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) +
-                      2*sizeof(Real_type ) * (m_kn-2) * (m_jn-1) +
-                      2*sizeof(Real_type ) * ((m_kn) * (m_jn) - 4) +
+                      2*sizeof(Real_type ) * (m_kn-2) * (m_jn-1) + // za, zb (2 point stencil)
+                      2*sizeof(Real_type ) * ((m_kn) * (m_jn) - 4) + // zz, zr (5 point stencil)
 
-                      4*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) );
-  setBytesWrittenPerRep( 2*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) +
+                      4*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) ); // zr, zu, zz, zv
+  setBytesWrittenPerRep( 2*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) + // za, zb
 
-                         2*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) +
+                         0 +
 
-                         2*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) );
+                         2*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) ); // zrout, zzout
+  setBytesModifyWrittenPerRep( 0 +
+
+                               2*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) + // zu, zv
+
+                               0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep((14 +
                   26 +
diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp
index f95dee3cf..2a7620200 100644
--- a/src/lcals/INT_PREDICT.cpp
+++ b/src/lcals/INT_PREDICT.cpp
@@ -28,8 +28,9 @@ INT_PREDICT::INT_PREDICT(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 10*sizeof(Real_type ) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() );
+  setBytesReadPerRep( 10*sizeof(Real_type ) * getActualProblemSize() ); // px(12), px(11), px(10), px(9), px(8), px(7), px(6), px(4), px(5), px(2)
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() ); // px(0)
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(17 * getActualProblemSize());
 
diff --git a/src/lcals/PLANCKIAN.cpp b/src/lcals/PLANCKIAN.cpp
index 8c6a30369..64b42f063 100644
--- a/src/lcals/PLANCKIAN.cpp
+++ b/src/lcals/PLANCKIAN.cpp
@@ -28,8 +28,9 @@ PLANCKIAN::PLANCKIAN(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 3*sizeof(Real_type ) * getActualProblemSize() );
-  setBytesWrittenPerRep( 2*sizeof(Real_type ) * getActualProblemSize() );
+  setBytesReadPerRep( 3*sizeof(Real_type ) * getActualProblemSize() ); // u, v, x
+  setBytesWrittenPerRep( 2*sizeof(Real_type ) * getActualProblemSize() ); // y, w
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(4 * getActualProblemSize()); // 1 exp
 
diff --git a/src/lcals/TRIDIAG_ELIM.cpp b/src/lcals/TRIDIAG_ELIM.cpp
index 1ffb776ac..a06727a3e 100644
--- a/src/lcals/TRIDIAG_ELIM.cpp
+++ b/src/lcals/TRIDIAG_ELIM.cpp
@@ -30,8 +30,9 @@ TRIDIAG_ELIM::TRIDIAG_ELIM(const RunParams& params)
 
   setItsPerRep( m_N-1 );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 3*sizeof(Real_type ) * (m_N-1) );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * (m_N-1) );
+  setBytesReadPerRep( 3*sizeof(Real_type ) * (m_N-1) ); // z, y, xin
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * (m_N-1) ); // xout
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * (m_N-1));
 
diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp
index 0b90664d7..a458f2414 100644
--- a/src/polybench/POLYBENCH_2MM.cpp
+++ b/src/polybench/POLYBENCH_2MM.cpp
@@ -44,14 +44,15 @@ POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& params)
 
   setItsPerRep( m_ni*m_nj + m_ni*m_nl );
   setKernelsPerRep(2);
-  setBytesReadPerRep( 1*sizeof(Real_type ) * m_ni * m_nk +
-                      1*sizeof(Real_type ) * m_nj * m_nk +
+  setBytesReadPerRep( 1*sizeof(Real_type ) * m_ni * m_nk + // A
+                      1*sizeof(Real_type ) * m_nj * m_nk + // B
 
-                      1*sizeof(Real_type ) * m_ni * m_nj +
-                      1*sizeof(Real_type ) * m_nj * m_nl );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ni * m_nj +
+                      1*sizeof(Real_type ) * m_ni * m_nj + // tmp
+                      1*sizeof(Real_type ) * m_nj * m_nl ); // C
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ni * m_nj + // tmp
 
-                         1*sizeof(Real_type ) * m_ni * m_nl );
+                         1*sizeof(Real_type ) * m_ni * m_nl ); // D
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(3 * m_ni*m_nj*m_nk +
                  2 * m_ni*m_nj*m_nl );
diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp
index fe4214c3f..557c2905a 100644
--- a/src/polybench/POLYBENCH_3MM.cpp
+++ b/src/polybench/POLYBENCH_3MM.cpp
@@ -47,19 +47,20 @@ POLYBENCH_3MM::POLYBENCH_3MM(const RunParams& params)
 
   setItsPerRep( m_ni*m_nj + m_nj*m_nl + m_ni*m_nl );
   setKernelsPerRep(3);
-  setBytesReadPerRep( 1*sizeof(Real_type ) * m_ni * m_nk +
-                      1*sizeof(Real_type ) * m_nj * m_nk +
+  setBytesReadPerRep( 1*sizeof(Real_type ) * m_ni * m_nk + // A
+                      1*sizeof(Real_type ) * m_nj * m_nk + // B
 
-                      1*sizeof(Real_type ) * m_nj * m_nm +
-                      1*sizeof(Real_type ) * m_nl * m_nm +
+                      1*sizeof(Real_type ) * m_nj * m_nm + // C
+                      1*sizeof(Real_type ) * m_nl * m_nm + // D
 
-                      1*sizeof(Real_type ) * m_ni * m_nj +
-                      1*sizeof(Real_type ) * m_nj * m_nl );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ni * m_nj +
+                      1*sizeof(Real_type ) * m_ni * m_nj + // E
+                      1*sizeof(Real_type ) * m_nj * m_nl ); // F
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ni * m_nj + // E
 
-                         1*sizeof(Real_type ) * m_nj * m_nl +
+                         1*sizeof(Real_type ) * m_nj * m_nl + // F
 
-                         1*sizeof(Real_type ) * m_ni * m_nl );
+                         1*sizeof(Real_type ) * m_ni * m_nl ); // G
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * m_ni*m_nj*m_nk +
                  2 * m_nj*m_nl*m_nm +
diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp
index d21eb7f79..2ac33591a 100644
--- a/src/polybench/POLYBENCH_ADI.cpp
+++ b/src/polybench/POLYBENCH_ADI.cpp
@@ -34,16 +34,17 @@ POLYBENCH_ADI::POLYBENCH_ADI(const RunParams& params)
   setActualProblemSize( (m_n-2) * (m_n-2) );
 
   setKernelsPerRep( 2 );
-  setBytesReadPerRep( 1*sizeof(Real_type ) * (m_n-2) * (m_n  ) +
-                      2*sizeof(Real_type ) * (m_n-2) * (m_n-2) +
+  setBytesReadPerRep( 1*sizeof(Real_type ) * (m_n-2) * (m_n  ) + // u
 
-                      1*sizeof(Real_type ) * (m_n-2) * (m_n  ) +
-                      2*sizeof(Real_type ) * (m_n-2) * (m_n-2) );
-  setBytesWrittenPerRep( 2*sizeof(Real_type ) * (m_n-2) * (m_n-1) +
-                         1*sizeof(Real_type ) * (m_n-2) * (m_n  ) +
+                      1*sizeof(Real_type ) * (m_n-2) * (m_n  ) ); // v
+  setBytesWrittenPerRep( 2*sizeof(Real_type ) * (m_n-2) * (    1) + // p, q
+                         1*sizeof(Real_type ) * (m_n-2) * (m_n  ) + // v
 
-                         2*sizeof(Real_type ) * (m_n-2) * (m_n-1) +
-                         1*sizeof(Real_type ) * (m_n-2) * (m_n  ) );
+                         2*sizeof(Real_type ) * (m_n-2) * (    1) + // p, q
+                         1*sizeof(Real_type ) * (m_n-2) * (m_n  ) ); // u
+  setBytesModifyWrittenPerRep( 2*sizeof(Real_type ) * (m_n-2) * (m_n-2) + // p, q
+
+                               2*sizeof(Real_type ) * (m_n-2) * (m_n-2) ); // p, q
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep( (13 + 2) * (m_n-2)*(m_n-2) +
                   (13 + 2) * (m_n-2)*(m_n-2) );
diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp
index d82c66403..568fadd8e 100644
--- a/src/polybench/POLYBENCH_ATAX.cpp
+++ b/src/polybench/POLYBENCH_ATAX.cpp
@@ -33,13 +33,17 @@ POLYBENCH_ATAX::POLYBENCH_ATAX(const RunParams& params)
 
   setItsPerRep( 2 * m_N + m_N );
   setKernelsPerRep(2);
-  setBytesReadPerRep( 1*sizeof(Real_type ) * m_N +
-                      1*sizeof(Real_type ) * m_N * m_N +
+  setBytesReadPerRep( 1*sizeof(Real_type ) * m_N +       // x
+                      1*sizeof(Real_type ) * m_N * m_N + // A
 
-                      2*sizeof(Real_type ) * m_N +
-                      1*sizeof(Real_type ) * m_N * m_N );
-  setBytesWrittenPerRep( 2*sizeof(Real_type ) * m_N +
-                         1*sizeof(Real_type ) * m_N);
+                      1*sizeof(Real_type ) * m_N +        // tmp
+                      1*sizeof(Real_type ) * m_N * m_N ); // A
+  setBytesWrittenPerRep( 2*sizeof(Real_type ) * m_N + // y, tmp
+
+                         0);
+  setBytesModifyWrittenPerRep( 0 +
+
+                               1*sizeof(Real_type ) * m_N ); // y
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * m_N*m_N +
                  2 * m_N*m_N );
diff --git a/src/polybench/POLYBENCH_FDTD_2D.cpp b/src/polybench/POLYBENCH_FDTD_2D.cpp
index 4c44a35d9..f9d89f804 100644
--- a/src/polybench/POLYBENCH_FDTD_2D.cpp
+++ b/src/polybench/POLYBENCH_FDTD_2D.cpp
@@ -43,24 +43,28 @@ POLYBENCH_FDTD_2D::POLYBENCH_FDTD_2D(const RunParams& params)
                 m_nx*(m_ny-1) +
                 (m_nx-1)*(m_ny-1) );
   setKernelsPerRep(4);
-  setBytesReadPerRep( 1*sizeof(Real_type ) +
+  setBytesReadPerRep( 1*sizeof(Real_type ) + // fict
 
-                      1*sizeof(Real_type ) * (m_nx-1) * m_ny +
-                      1*sizeof(Real_type ) * m_nx * m_ny +
+                      1*sizeof(Real_type ) * m_nx * m_ny + // hz
 
-                      1*sizeof(Real_type ) * m_nx * (m_ny-1) +
-                      1*sizeof(Real_type ) * m_nx * m_ny +
+                      1*sizeof(Real_type ) * m_nx * m_ny + // hz
 
-                      1*sizeof(Real_type ) * (m_nx-1) * (m_ny-1) +
-                      1*sizeof(Real_type ) * (m_nx-1) * m_ny +
-                      1*sizeof(Real_type ) * m_nx * (m_ny-1) );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ny +
+                      1*sizeof(Real_type ) * (m_nx-1) * m_ny + // ex
+                      1*sizeof(Real_type ) * m_nx * (m_ny-1) ); // ey
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ny + // ey
 
-                         1*sizeof(Real_type ) * (m_nx-1) * m_ny +
+                         0 +
 
-                         1*sizeof(Real_type ) * m_nx * (m_ny-1) +
+                         0 +
 
-                         1*sizeof(Real_type ) * (m_nx-1) * (m_ny-1) );
+                         0 ); // hz
+  setBytesModifyWrittenPerRep( 0 +
+
+                               1*sizeof(Real_type ) * (m_nx-1) * m_ny + // ey
+
+                               1*sizeof(Real_type ) * m_nx * (m_ny-1) + // ex
+
+                               1*sizeof(Real_type ) * (m_nx-1) * (m_ny-1) ); // hz
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep( 0 * m_ny +
                   3 * (m_nx-1)*m_ny +
diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp
index 65ea833ca..31399e04c 100644
--- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp
+++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp
@@ -33,8 +33,9 @@ POLYBENCH_FLOYD_WARSHALL::POLYBENCH_FLOYD_WARSHALL(const RunParams& params)
 
   setItsPerRep( m_N * m_N*m_N );
   setKernelsPerRep(m_N);
-  setBytesReadPerRep( m_N * 1*sizeof(Real_type ) * m_N * m_N );
-  setBytesWrittenPerRep( m_N * 1*sizeof(Real_type ) * m_N * m_N );
+  setBytesReadPerRep( m_N * 1*sizeof(Real_type ) * m_N * m_N ); // pin
+  setBytesWrittenPerRep( m_N * 1*sizeof(Real_type ) * m_N * m_N ); // pout
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep( m_N*m_N*m_N * 3 / 2 ); // conditional is true about half of the time
 
diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp
index 797754d7e..f307a1a9c 100644
--- a/src/polybench/POLYBENCH_GEMM.cpp
+++ b/src/polybench/POLYBENCH_GEMM.cpp
@@ -40,10 +40,10 @@ POLYBENCH_GEMM::POLYBENCH_GEMM(const RunParams& params)
 
   setItsPerRep( m_ni * m_nj );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type ) * m_ni * m_nj +
-                      1*sizeof(Real_type ) * m_ni * m_nk +
-                      1*sizeof(Real_type ) * m_nj * m_nk );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ni * m_nj);
+  setBytesReadPerRep( 1*sizeof(Real_type ) * m_ni * m_nk + // A
+                      1*sizeof(Real_type ) * m_nj * m_nk ); // B
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ni * m_nj); // C
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep((1 +
                   3 * m_nk) * m_ni*m_nj);
diff --git a/src/polybench/POLYBENCH_GEMVER.cpp b/src/polybench/POLYBENCH_GEMVER.cpp
index 8a8555a22..5bf405456 100644
--- a/src/polybench/POLYBENCH_GEMVER.cpp
+++ b/src/polybench/POLYBENCH_GEMVER.cpp
@@ -39,23 +39,29 @@ POLYBENCH_GEMVER::POLYBENCH_GEMVER(const RunParams& params)
                 m_n +
                 m_n );
   setKernelsPerRep(4);
-  setBytesReadPerRep( 1*sizeof(Real_type ) * m_n * m_n +
-                      4*sizeof(Real_type ) * m_n +
+  setBytesReadPerRep( 4*sizeof(Real_type ) * m_n + // u1, v1, u2, v2
 
-                      1*sizeof(Real_type ) * m_n * m_n +
-                      1*sizeof(Real_type ) * m_n +
+                      1*sizeof(Real_type ) * m_n * m_n + // A
+                      1*sizeof(Real_type ) * m_n + // y
 
-                      2*sizeof(Real_type ) * m_n +
+                      1*sizeof(Real_type ) * m_n + // z
 
-                      1*sizeof(Real_type ) * m_n * m_n +
-                      2*sizeof(Real_type ) * m_n );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_n * m_n +
+                      1*sizeof(Real_type ) * m_n * m_n + // A
+                      2*sizeof(Real_type ) * m_n ); // x
+  setBytesWrittenPerRep( 0 +
 
-                         1*sizeof(Real_type ) * m_n +
+                         1*sizeof(Real_type ) * m_n + // x
 
-                         1*sizeof(Real_type ) * m_n +
+                         0 +
 
-                         1*sizeof(Real_type ) * m_n );
+                         1*sizeof(Real_type ) * m_n ); // w
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type ) * m_n * m_n + // A
+
+                               0 +
+
+                               1*sizeof(Real_type ) * m_n + // x
+
+                               0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(4 * m_n*m_n +
                  3 * m_n*m_n +
diff --git a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp
index 2e8cec776..922fc14ec 100644
--- a/src/polybench/POLYBENCH_GESUMMV.cpp
+++ b/src/polybench/POLYBENCH_GESUMMV.cpp
@@ -36,9 +36,10 @@ POLYBENCH_GESUMMV::POLYBENCH_GESUMMV(const RunParams& params)
 
   setItsPerRep( m_N );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type ) * m_N +
-                      2*sizeof(Real_type ) * m_N * m_N );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_N );
+  setBytesReadPerRep( 1*sizeof(Real_type ) * m_N + // x
+                      2*sizeof(Real_type ) * m_N * m_N ); // A, B
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_N ); // y
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep((4 * m_N +
                   3 ) * m_N  );
diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp
index cb197ebc5..2b771399c 100644
--- a/src/polybench/POLYBENCH_HEAT_3D.cpp
+++ b/src/polybench/POLYBENCH_HEAT_3D.cpp
@@ -34,10 +34,13 @@ POLYBENCH_HEAT_3D::POLYBENCH_HEAT_3D(const RunParams& params)
 
   setItsPerRep( 2 * getActualProblemSize() );
   setKernelsPerRep( 2 );
-  setBytesReadPerRep( 1*sizeof(Real_type ) * (m_N * m_N * m_N - 12*(m_N-2) - 8) +
-                      1*sizeof(Real_type ) * (m_N * m_N * m_N - 12*(m_N-2) - 8));
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * (m_N-2) * (m_N-2) * (m_N-2) +
-                         1*sizeof(Real_type ) * (m_N-2) * (m_N-2) * (m_N-2) );
+  setBytesReadPerRep( 1*sizeof(Real_type ) * (m_N * m_N * m_N - 12*(m_N-2) - 8) + // A (7 point stencil)
+
+                      1*sizeof(Real_type ) * (m_N * m_N * m_N - 12*(m_N-2) - 8)); // B (7 point stencil)
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * (m_N-2) * (m_N-2) * (m_N-2) + // B
+
+                         1*sizeof(Real_type ) * (m_N-2) * (m_N-2) * (m_N-2) ); // A
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep( 15 * (m_N-2) * (m_N-2) * (m_N-2) +
                   15 * (m_N-2) * (m_N-2) * (m_N-2) );
diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp
index 1e781adbd..dc9b1b139 100644
--- a/src/polybench/POLYBENCH_JACOBI_1D.cpp
+++ b/src/polybench/POLYBENCH_JACOBI_1D.cpp
@@ -33,10 +33,13 @@ POLYBENCH_JACOBI_1D::POLYBENCH_JACOBI_1D(const RunParams& params)
 
   setItsPerRep( 2 * getActualProblemSize() );
   setKernelsPerRep(2);
-  setBytesReadPerRep( 1*sizeof(Real_type ) * m_N +
-                      1*sizeof(Real_type ) * m_N );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * (m_N-2) +
-                         1*sizeof(Real_type ) * (m_N-2) );
+  setBytesReadPerRep( 1*sizeof(Real_type ) * m_N + // A (3 point stencil)
+
+                      1*sizeof(Real_type ) * m_N ); // B (3 point stencil)
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * (m_N-2) + // B
+
+                         1*sizeof(Real_type ) * (m_N-2) ); // A
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep( 3 * (m_N-2) +
                   3 * (m_N-2) );
diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp
index ce0fe9e3d..583eb8481 100644
--- a/src/polybench/POLYBENCH_JACOBI_2D.cpp
+++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp
@@ -33,10 +33,13 @@ POLYBENCH_JACOBI_2D::POLYBENCH_JACOBI_2D(const RunParams& params)
 
   setItsPerRep( 2 * (m_N-2) * (m_N-2) );
   setKernelsPerRep(2);
-  setBytesReadPerRep( 1*sizeof(Real_type ) * (m_N * m_N - 4) +
-                      1*sizeof(Real_type ) * (m_N * m_N - 4) );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * (m_N-2) * (m_N-2) +
-                         1*sizeof(Real_type ) * (m_N-2) * (m_N-2) );
+  setBytesReadPerRep( 1*sizeof(Real_type ) * (m_N * m_N - 4) + // A (5 point stencil)
+
+                      1*sizeof(Real_type ) * (m_N * m_N - 4) ); // B (5 point stencil)
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * (m_N-2) * (m_N-2) + // B
+
+                         1*sizeof(Real_type ) * (m_N-2) * (m_N-2) ); // A
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep( 5 * (m_N-2)*(m_N-2) +
                   5 * (m_N-2)*(m_N-2) );
diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp
index e095fdfac..49759868a 100644
--- a/src/polybench/POLYBENCH_MVT.cpp
+++ b/src/polybench/POLYBENCH_MVT.cpp
@@ -33,14 +33,15 @@ POLYBENCH_MVT::POLYBENCH_MVT(const RunParams& params)
 
   setItsPerRep( 2 * m_N );
   setKernelsPerRep(2);
-  setBytesReadPerRep( 1*sizeof(Real_type ) * m_N +
-                      1*sizeof(Real_type ) * m_N * m_N +
+  setBytesReadPerRep( 1*sizeof(Real_type ) * m_N + // y1
+                      1*sizeof(Real_type ) * m_N * m_N + // A
 
-                      1*sizeof(Real_type ) * m_N +
-                      1*sizeof(Real_type ) * m_N * m_N );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_N +
+                      1*sizeof(Real_type ) * m_N + // y2
+                      1*sizeof(Real_type ) * m_N * m_N ); // A
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_N + // x1
 
-                         1*sizeof(Real_type ) * m_N );
+                         1*sizeof(Real_type ) * m_N ); // x2
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * m_N*m_N +
                  2 * m_N*m_N );
diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp
index ba5837344..3f9a4be00 100644
--- a/src/stream/ADD.cpp
+++ b/src/stream/ADD.cpp
@@ -28,8 +28,9 @@ ADD::ADD(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() );
+  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // a, b
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() ); // c
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * getActualProblemSize());
 
diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp
index 63bd60320..819d19e51 100644
--- a/src/stream/COPY.cpp
+++ b/src/stream/COPY.cpp
@@ -28,8 +28,9 @@ COPY::COPY(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // a
+  setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() ); // c
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(0);
 
diff --git a/src/stream/DOT.cpp b/src/stream/DOT.cpp
index 643ee461a..32577712b 100644
--- a/src/stream/DOT.cpp
+++ b/src/stream/DOT.cpp
@@ -28,9 +28,9 @@ DOT::DOT(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) +
-                      2*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) );
+  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // a, b
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * getActualProblemSize());
 
diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp
index 8243d80e5..9032ae6ca 100644
--- a/src/stream/MUL.cpp
+++ b/src/stream/MUL.cpp
@@ -28,8 +28,9 @@ MUL::MUL(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // c
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // b
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(1 * getActualProblemSize());
 
diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp
index 6f26adee3..ab68661e8 100644
--- a/src/stream/TRIAD.cpp
+++ b/src/stream/TRIAD.cpp
@@ -28,8 +28,9 @@ TRIAD::TRIAD(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() );
-  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // b, c
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // a
+  setBytesModifyWrittenPerRep( 0 );
   setBytesAtomicModifyWrittenPerRep( 0 );
   setFLOPsPerRep(2 * getActualProblemSize());
 

From d3e13284b9b8d8e0deb0ac749c95bf88dd7e13d7 Mon Sep 17 00:00:00 2001
From: Jason Burmark <burmark1@llnl.gov>
Date: Wed, 17 Dec 2025 10:58:07 -0800
Subject: [PATCH 3/5] Update docs

---
 docs/sphinx/dev_guide/kernel_class_impl.rst | 20 ++++++++++++++++++--
 docs/sphinx/user_guide/output.rst           | 18 +++++++++++++++++-
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/docs/sphinx/dev_guide/kernel_class_impl.rst b/docs/sphinx/dev_guide/kernel_class_impl.rst
index 79ea7b23c..a1e9377c7 100644
--- a/docs/sphinx/dev_guide/kernel_class_impl.rst
+++ b/docs/sphinx/dev_guide/kernel_class_impl.rst
@@ -50,8 +50,13 @@ The methods in the source file are:
         is the problem size and the number of loop kernels is one. Other kernels
         in the Suite may execute multiple loop kernels with different sizes,
         so these methods are used to describe this.
-      * The number of bytes read and written and the number of floating point
-        operations (FLOPS) performed for each kernel execution.
+      * The number of bytes read for each kernel execution.
+      * The number of bytes written for each kernel execution.
+      * The number of bytes read, modified, and written for each kernel execution.
+      * The number of bytes atomically read, modified, and written for each
+        kernel execution.
+      * The number of floating point operations (FLOPS) performed for each
+        kernel execution.
       * The operational complexity of the kernel.
       * Which RAJA features the kernel exercises.
       * Adding Suite variants and tunings via ``addVariantTunings``. This calls
@@ -60,6 +65,17 @@ The methods in the source file are:
         not every kernel implements every variant, so ``KernelBase`` provides a
         "default" implementation that defines no variants or tunings.
 
+    ..note:: When counting the number of bytes accessed each byte accessed is
+             counted in exactly one of the byte counters. For example a byte
+             that read and written is counted in the bytes read, modified, and
+             written counter and not in the read or written counters. Each byte
+             accessed is counted once for each loop/kernel launch it is accessed
+             in even if it is accessed multiple times within a loop/kernel
+             launch. For example if a byte is written five times in the first
+             loop/kernel launch and read two times in the second loop/kernel
+             launch of a kernel, then it is counted once in the bytes read
+             counter and once in the bytes written counter.
+
     ..note:: Available variant tunings for each kernel are specified using a
              ``...BOILERPLATE...`` macro invocation in each kernel variant
              source file. This is discussed in :ref:`kernel_class_impl_exec-label`.
diff --git a/docs/sphinx/user_guide/output.rst b/docs/sphinx/user_guide/output.rst
index 522f220a8..88cd7d180 100644
--- a/docs/sphinx/user_guide/output.rst
+++ b/docs/sphinx/user_guide/output.rst
@@ -80,21 +80,37 @@ Information reported in the file for each kernel is:
   * **Kernels/rep** -- total number of loop structures run (or GPU kernels 
     launched) in each kernel repetition.
   * **Bytes/rep** -- Total number of bytes read from and written to memory for 
-    each repetition of kernel.
+    each repetition of kernel. This is a best case scenario of the total traffic
+    to and from memory assuming perfect cache reuse and ignoring partial usage
+    of data in some memory transactions.
   * **FLOPs/rep** -- Total number of floating point operations executed for 
     each repetition of kernel. Currently, we count arithmetic operations 
     (+, -, *, /) and functions, such as exp, sin, etc. as one FLOP. We do not 
     currently count operations like abs and comparisons (<, >, etc.) in the 
     FLOP count. So these numbers are rough estimates. For actual FLOP counts, 
     a performance analysis tool should be used.
+  * **BytesTouched/rep** -- Total number of bytes accessed for each repetition
+    of kernel. This is a best case scenario for the amount of cache needed to
+    fit all of the data used by the kernel ignoring partial usage of some cache
+    lines.
   * **BytesRead/rep** -- Total number of bytes read from memory for 
     each repetition of kernel.
   * **BytesWritten/rep** -- Total number of bytes written to memory for 
     each repetition of kernel.
+  * **BytesModifyWritten/rep** -- Total number of bytes modified (read and
+    written) for each repetition of kernel.
   * **BytesAtomicModifyWritten/rep** -- Total number of bytes modified by 
     atomic operations in a kernel. If a kernel contains no atomic operations, 
     the value of zero is reported.
 
+  ..note:: The BytesRead/rep, BytesWritten/rep, BytesModifyWritten/rep, and
+           BytesAtomicModifyWritten/rep counts are mutually exclusive, so
+           access(es) to a memory location in a loop is only counted in one of
+           these counts.
+
+  ..note:: The Bytes*/rep and FLOPs/rep counts are estimates for kernels
+           involving randomness or difficult to count algorithms.
+
 .. _output_probsize-label:
 
 ============================

From 2027c3d2e5b105f09c506546ad5d26b00cadaf11 Mon Sep 17 00:00:00 2001
From: Jason Burmark <burmark1@llnl.gov>
Date: Thu, 18 Dec 2025 11:58:24 -0800
Subject: [PATCH 4/5] Count RMW in R and W output

Undo the separation of the read-modify-write count from the read count
and write count in the output. The BytesRead/rep count now includes both
"read only" bytes and "read, modified, and written" bytes, and likewise
for BytesWritten/rep.
---
 docs/sphinx/user_guide/output.rst | 10 +++-------
 src/common/KernelBase.hpp         |  4 ++--
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/docs/sphinx/user_guide/output.rst b/docs/sphinx/user_guide/output.rst
index 88cd7d180..3fee5cf32 100644
--- a/docs/sphinx/user_guide/output.rst
+++ b/docs/sphinx/user_guide/output.rst
@@ -97,17 +97,13 @@ Information reported in the file for each kernel is:
     each repetition of kernel.
   * **BytesWritten/rep** -- Total number of bytes written to memory for 
     each repetition of kernel.
-  * **BytesModifyWritten/rep** -- Total number of bytes modified (read and
-    written) for each repetition of kernel.
+  * **BytesModifyWritten/rep** -- Total number of bytes modified for each
+    repetition of kernel. These bytes are the intersection of the bytes
+    counted in ``BytesRead/rep`` and ``BytesWritten/rep``.
   * **BytesAtomicModifyWritten/rep** -- Total number of bytes modified by 
     atomic operations in a kernel. If a kernel contains no atomic operations, 
     the value of zero is reported.
 
-  ..note:: The BytesRead/rep, BytesWritten/rep, BytesModifyWritten/rep, and
-           BytesAtomicModifyWritten/rep counts are mutually exclusive, so
-           access(es) to a memory location in a loop is only counted in one of
-           these counts.
-
   ..note:: The Bytes*/rep and FLOPs/rep counts are estimates for kernels
            involving randomness or difficult to count algorithms.
 
diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp
index 64021d887..a63b3d063 100644
--- a/src/common/KernelBase.hpp
+++ b/src/common/KernelBase.hpp
@@ -193,8 +193,8 @@ class KernelBase
   Index_type getKernelsPerRep() const { return kernels_per_rep; };
   Index_type getBytesPerRep() const { return bytes_read_per_rep + bytes_written_per_rep + 2*bytes_modify_written_per_rep + 2*bytes_atomic_modify_written_per_rep; } // count modify_write operations twice to get the memory traffic
   Index_type getBytesTouchedPerRep() const { return bytes_read_per_rep + bytes_written_per_rep + bytes_modify_written_per_rep + bytes_atomic_modify_written_per_rep; } // count modify_write operations once to get the data size only
-  Index_type getBytesReadPerRep() const { return bytes_read_per_rep; }
-  Index_type getBytesWrittenPerRep() const { return bytes_written_per_rep; }
+  Index_type getBytesReadPerRep() const { return bytes_read_per_rep + bytes_modify_written_per_rep; } // read-only bytes plus modify-written bytes; atomic modify-written bytes are not added here
+  Index_type getBytesWrittenPerRep() const { return bytes_written_per_rep + bytes_modify_written_per_rep; } // write-only bytes plus modify-written bytes; atomic modify-written bytes are not added here
   Index_type getBytesModifyWrittenPerRep() const { return bytes_modify_written_per_rep; }
   Index_type getBytesAtomicModifyWrittenPerRep() const { return bytes_atomic_modify_written_per_rep; }
   Index_type getFLOPsPerRep() const { return FLOPs_per_rep; }

From 64285ef1e1a496fdff43b751f3d7f4da903d3264 Mon Sep 17 00:00:00 2001
From: Jason Burmark <burmark1@llnl.gov>
Date: Thu, 18 Dec 2025 11:58:41 -0800
Subject: [PATCH 5/5] Improve documentation about how to count bytes

---
 docs/sphinx/dev_guide/kernel_class_impl.rst | 25 ++++++++++++---------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/docs/sphinx/dev_guide/kernel_class_impl.rst b/docs/sphinx/dev_guide/kernel_class_impl.rst
index a1e9377c7..becc2c617 100644
--- a/docs/sphinx/dev_guide/kernel_class_impl.rst
+++ b/docs/sphinx/dev_guide/kernel_class_impl.rst
@@ -65,16 +65,21 @@ The methods in the source file are:
         not every kernel implements every variant, so ``KernelBase`` provides a
         "default" implementation that defines no variants or tunings.
 
-    ..note:: When counting the number of bytes accessed each byte accessed is
-             counted in exactly one of the byte counters. For example a byte
-             that read and written is counted in the bytes read, modified, and
-             written counter and not in the read or written counters. Each byte
-             accessed is counted once for each loop/kernel launch it is accessed
-             in even if it is accessed multiple times within a loop/kernel
-             launch. For example if a byte is written five times in the first
-             loop/kernel launch and read two times in the second loop/kernel
-             launch of a kernel, then it is counted once in the bytes read
-             counter and once in the bytes written counter.
+    .. note:: The byte counters are intended to count traffic to and from main
+              memory, such as DRAM or HBM, under idealized conditions with
+              perfect caching. They are not intended to count the total number
+              of bytes requested by load and store instructions. So, even if a
+              memory address is read in multiple iterations of a loop, as in a
+              stencil access pattern, it is counted only once in bytes read.
+              However, caching is not assumed between loops/kernel launches, so
+              an address is counted once per separate loop or kernel launch.
+
+    .. note:: To simplify counting, each address accessed should be counted in
+              only one of the byte counter attributes. For example, an address
+              that is read and written is counted in the "read, modified, and
+              written" counter, but not in the "read" or "written" counters.
+              The final output, however, adds the "read" and "read, modified,
+              and written" counters when showing the bytes read.
 
     ..note:: Available variant tunings for each kernel are specified using a
              ``...BOILERPLATE...`` macro invocation in each kernel variant