From c240b87edd402465b848c20549637ff3a3d86f53 Mon Sep 17 00:00:00 2001 From: Amalee Wilson Date: Sun, 14 Jun 2020 23:26:48 -0700 Subject: [PATCH 1/4] Added shared memory workspaces for GPUs. Functional but needs cleanup. --- include/taco/codegen/module.h | 3 + include/taco/index_notation/index_notation.h | 4 + .../taco/index_notation/provenance_graph.h | 1 + include/taco/index_notation/transformations.h | 1 + include/taco/ir/ir.h | 4 +- include/taco/tensor.h | 6 + src/codegen/codegen_cuda.cpp | 180 +++++++++++++++++- src/codegen/codegen_cuda.h | 5 + src/codegen/module.cpp | 45 +++++ src/index_notation/index_notation.cpp | 34 ++++ src/index_notation/transformations.cpp | 25 +++ src/ir/ir.cpp | 19 +- src/lower/lowerer_impl.cpp | 45 ++++- src/tensor.cpp | 26 +++ test/tests-scheduling-eval.cpp | 158 +++++++++++++++ 15 files changed, 548 insertions(+), 8 deletions(-) diff --git a/include/taco/codegen/module.h b/include/taco/codegen/module.h index 788156fdb..4b6aa6f37 100644 --- a/include/taco/codegen/module.h +++ b/include/taco/codegen/module.h @@ -25,6 +25,9 @@ class Module { /// Compile the source into a library, returning its full path std::string compile(); + + // Recompile + std::string recompile(std::string file_path); /// Compile the module into a source file located at the specified location /// path and prefix. The generated source will be path/prefix.{.c|.bc, .h} diff --git a/include/taco/index_notation/index_notation.h b/include/taco/index_notation/index_notation.h index bda31635a..04ec9c602 100644 --- a/include/taco/index_notation/index_notation.h +++ b/include/taco/index_notation/index_notation.h @@ -620,6 +620,7 @@ class IndexStmt : public util::IntrusivePtr { /// allows us to leverage scratchpad memories and /// reorder computations to increase locality IndexStmt precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace) const; + IndexStmt precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace, bool shared_mem) const; /// bound specifies a compile-time constraint on an index variable's /// iteration space that allows knowledge of the @@ -844,6 +845,9 @@ class TensorVar : public util::Comparable { TensorVar(const std::string& name, const Type& type); TensorVar(const Type& type, const Format& format); TensorVar(const std::string& name, const Type& type, const Format& format); + TensorVar(const std::string& name, const Type& type, const Format& format, bool is_shared_mem); + + bool is_shared_memory(); /// Returns the name of the tensor variable. 
std::string getName() const; diff --git a/include/taco/index_notation/provenance_graph.h b/include/taco/index_notation/provenance_graph.h index cc2b12d7b..e5f2b195e 100644 --- a/include/taco/index_notation/provenance_graph.h +++ b/include/taco/index_notation/provenance_graph.h @@ -226,6 +226,7 @@ bool operator==(const BoundRelNode&, const BoundRelNode&); /// This allows precomputeVar to be scheduled separately from the parentVar struct PrecomputeRelNode : public IndexVarRelNode { PrecomputeRelNode(IndexVar parentVar, IndexVar precomputeVar); + PrecomputeRelNode(IndexVar parentVar, IndexVar precomputeVar, bool shared_mem); const IndexVar& getParentVar() const; const IndexVar& getPrecomputeVar() const; diff --git a/include/taco/index_notation/transformations.h b/include/taco/index_notation/transformations.h index 3dd4fe141..df29b7feb 100644 --- a/include/taco/index_notation/transformations.h +++ b/include/taco/index_notation/transformations.h @@ -87,6 +87,7 @@ class Precompute : public TransformationInterface { public: Precompute(); Precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace); + Precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace, bool shared_mem); IndexExpr getExpr() const; IndexVar geti() const; diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h index 15dbdc7aa..56f186d66 100644 --- a/include/taco/ir/ir.h +++ b/include/taco/ir/ir.h @@ -254,9 +254,10 @@ struct Var : public ExprNode { std::string name; bool is_ptr; bool is_tensor; + bool is_shared_memory = false; static Expr make(std::string name, Datatype type, bool is_ptr=false, - bool is_tensor=false); + bool is_tensor=false, bool is_shared_memory=false); static const IRNodeType _type_info = IRNodeType::Var; }; @@ -685,6 +686,7 @@ struct Allocate : public StmtNode { Expr num_elements; Expr old_elements; // used for realloc in CUDA bool is_realloc; + bool is_shared_memory; static Stmt make(Expr var, Expr num_elements, bool is_realloc=false, Expr old_elements=Expr()); diff --git a/include/taco/tensor.h b/include/taco/tensor.h index 05bc1773b..cab402346 100644 --- a/include/taco/tensor.h +++ b/include/taco/tensor.h @@ -406,12 +406,18 @@ class TensorBase { void compile(IndexStmt stmt, bool assembleWhileCompute=false); + void recompile(std::string file_path); + /// Assemble the tensor storage, including index and value arrays. void assemble(); + void reassemble(); + /// Compute the given expression and put the values in the tensor storage. void compute(); + void recompute(); + /// Compile, assemble and compute as needed. void evaluate(); diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp index 5eb57c7ad..b7e59f691 100644 --- a/src/codegen/codegen_cuda.cpp +++ b/src/codegen/codegen_cuda.cpp @@ -295,6 +295,8 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor { vector> warpIDVars; vector numThreads; vector numWarps; + vector sizeSharedMemory; + std::string typeSharedMemory; CodeGen_CUDA *codeGen; // copy inputs and outputs into the map @@ -359,6 +361,17 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor { threadFors.push_back(op); threadIDVars.push_back(pair(scopeMap[op->var], op->var)); + // if (isa(op->var)){ + // std::cout << "~~~~~ Found a var! 
" << to(op->var)->name << std::endl; + // if (to(op->var)->is_shared_memory) + // { + // std::cout << "~~~~~ THIS VAR IS MARKED AS SHARED MEMORY" << std::endl; + + // // string elementType = printCUDAType(op->var.type(), false); + // // sizeSharedMemory.push_back(Mul::make(to()->num_elements, Literal::make(256))); + // } + // } + Expr blockSize = ir::simplify(ir::Div::make(ir::Sub::make(op->end, op->start), op->increment)); numThreads.push_back(blockSize); } @@ -378,6 +391,20 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor { } virtual void visit(const Var *op) { + + if (isa(op)){ + // std::cout << "~~~~~ Found a var! " << to(op)->name << std::endl; + if (to(op)->is_shared_memory) + { + std::cout << "~~~~~ THIS VAR IS MARKED AS SHARED MEMORY" << std::endl; + // std::cout << to(op) << std::endl; + string elementType = printCUDAType( op->type, false); + std::cout << "element type : " << elementType << std::endl; + typeSharedMemory = elementType; + // sizeSharedMemory.push_back(Mul::make(to()->num_elements, Literal::make(256))); + } + } + if (scopeMap.count(op) == 0) { string name = codeGen->genUniqueName(op->name); if (!inDeviceFunction) { @@ -395,6 +422,20 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor { } virtual void visit(const VarDecl *op) { + + // if (isa(op->var)){ + // std::cout << "@@@@ Found a var decl! " << to(op->var)->name << std::endl; + // if (to(op->var)->is_shared_memory) + // { + // std::cout << "@@@@@@@ THIS VAR IS MARKED AS SHARED MEMORY" << std::endl; + // // std::cout << to(op) << std::endl; + // // string elementType = printCUDAType( op->type, false); + // // std::cout << "element type : " << elementType << std::endl; + // // typeSharedMemory = elementType; + // // sizeSharedMemory.push_back(Mul::make(to()->num_elements, Literal::make(256))); + // } + // } + if (inDeviceFunction) { variablesDeclaredInKernel.insert(op->var); } @@ -580,7 +621,23 @@ void CodeGen_CUDA::printDeviceFuncCall(const vector> currentP gridSize.accept(this); stream << ", "; blockSize.accept(this); - stream << ">>>"; + + if (usesshared == false) + { + stream << ">>>"; + } + else + { + // BIG TODO: no hard code 2048 + // Should be num_threads * num_prec_elems * sizeof(prec_type) + stream << ", " ; + sizeofshared.accept(this); + stream << " * sizeof(" << typeofshared << ")>>>"; + // 2048*sizeof(double)>>>"; + } + + + stream << "("; string delimiter = ""; @@ -627,6 +684,9 @@ void CodeGen_CUDA::compile(Stmt stmt, bool isFirst) { parentParallelUnits = {}; parallelUnitSizes = {}; parallelUnitIDVars = {}; + sizeofshared = Expr(); + typeofshared = ""; + usesshared = false; emittedTimerStartCode = false; isHostFunction = true; if (isFirst) { @@ -1021,8 +1081,107 @@ void CodeGen_CUDA::visit(const Max* op) { void CodeGen_CUDA::visit(const Allocate* op) { string elementType = printCUDAType(op->var.type(), false); + + // std::cout << op->var << std::endl; + if (isa(op->var)){ + if (to(op->var)->name == "precomputed") + { + std::cout << " found precomputed!! 
" << std::endl; + // std::cout << "var : " << to(op->var) << std::endl; + // std::cout << "*var : " << *(to(op->var)) << std::endl; + // stream << "__shared__" << " "; + } + if (to(op->var)->is_shared_memory) + { + std::cout << "HEY IT IS MARKED AS SHARED MEMORY" << std::endl; + // doIndent(); + // stream << "__shared__ "; + } + else + { + std::cout << "NOT marked as shared memory" << std::endl; + } + + // __syncthreads() ??; + } + if (!isHostFunction) { - if (parentParallelUnits.count(ParallelUnit::GPUThread)) { + + if (to(op->var)->is_shared_memory) + { + std::cout << "AT visit allocate, " << to(op->var)->name << " is shared mem" << std::endl; +// __shared__ double w_GPUThread[32]; if no warps + // __shared__ double w_GPUThread_ALL[32 * # num warps]; if warps + // double * w_GPUThread = w_GPUThread_ALL + warp_id * 32; + + // Example.. todo: figure this out and put it back + taco_iassert(!op->is_realloc); + doIndent(); + stream << "__shared__ " << elementType << " "; + op->var.accept(this); + if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { + stream << "_ALL"; + } + stream << "["; + if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { + // Expr numElements = Mul::make(op->num_elements, + // Div::make(parallelUnitSizes[ParallelUnit::GPUBlock], + // parallelUnitSizes[ParallelUnit::GPUWarp])); + // Expr numElements = Mul::make(op->num_elements, + // parallelUnitSizes[ParallelUnit::GPUWarp]); + Expr numElements = Mul::make(op->num_elements, + parallelUnitSizes[ParallelUnit::GPUBlock]); + // BIG TODO: remove hard coded 8 + // std::cout << "######warp " << parallelUnitSizes[ParallelUnit::GPUWarp] << std::endl; + // std::cout << "######block " << parallelUnitSizes[ParallelUnit::GPUBlock] << std::endl; + // std::cout << "######threads " << parallelUnitSizes[ParallelUnit::GPUThread] << std::endl; + // std::cout << "######warpred " << parallelUnitSizes[ParallelUnit::GPUWarpReduction] << std::endl; + // std::cout << "######blockred " << parallelUnitSizes[ParallelUnit::GPUBlockReduction] << std::endl; + + // std::pair pr = ; + sizeofshared = numElements; + typeofshared = elementType; + usesshared = true; + // sharedMemTypes.push_back(std::make_pair(op->var, elementType)); + // std::cout << "var type from map: " << sharedMemTypes[0].second << std::endl; + // sharedMemSizes.push_back(std::make_pair(op->var, numElements)); + // std::cout << "var numElem from map: " << sharedMemSizes[0].second << std::endl; + + + + + ir::simplify(numElements).accept(this); + } + else { + std::cout << "AT visit allocate, " << to(op->var)->name << " is *** NOT*** shared mem" << std::endl; + doIndent(); + stream << elementType << " "; + op->var.accept(this); + stream << "["; + op->num_elements.accept(this); + stream << "];" << endl; + return; + } + stream << "];" << endl; + if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { + doIndent(); + stream << elementType << " * "; + op->var.accept(this); + + stream << " = "; + op->var.accept(this); + stream << "_ALL + threadIdx.x"; + // parallelUnitIDVars[ParallelUnit::GPUWarp].accept(this); + stream << " * "; + op->num_elements.accept(this); + // parallelUnitSizes[ParallelUnit::GPUWarp].accept(this); + stream << ";" << endl; + } + return; + } + else + { + if (parentParallelUnits.count(ParallelUnit::GPUThread)) { // double w_GPUThread[num]; // for threads allocate thread local memory doIndent(); @@ -1066,6 +1225,23 @@ void CodeGen_CUDA::visit(const Allocate* op) { stream << ";" << endl; } return; + } + + + + + // if (parentParallelUnits.count(ParallelUnit::GPUThread)) 
{ + // // double w_GPUThread[num]; + // // for threads allocate thread local memory + // doIndent(); + // stream << elementType << " "; + // op->var.accept(this); + // stream << "["; + // op->num_elements.accept(this); + // stream << "];" << endl; + // return; + // } + } string variable_name; if (op->is_realloc) { diff --git a/src/codegen/codegen_cuda.h b/src/codegen/codegen_cuda.h index 2bc8e000d..654344b39 100644 --- a/src/codegen/codegen_cuda.h +++ b/src/codegen/codegen_cuda.h @@ -73,6 +73,11 @@ class CodeGen_CUDA : public CodeGen { std::map parallelUnitSizes; std::map parallelUnitIDVars; + + Expr sizeofshared; + std::string typeofshared; + bool usesshared; + bool emittedTimerStartCode = false; std::ostream &out; diff --git a/src/codegen/module.cpp b/src/codegen/module.cpp index fc52c409d..6c84bc56d 100644 --- a/src/codegen/module.cpp +++ b/src/codegen/module.cpp @@ -164,6 +164,51 @@ string Module::compile() { return fullpath; } +string Module::recompile(string file_path) { + string prefix = file_path; + string fullpath = prefix + ".so"; + + string cc; + string cflags; + string file_ending; + string shims_file; + if (should_use_CUDA_codegen()) { + cc = "nvcc"; + cflags = util::getFromEnv("TACO_NVCCFLAGS", + get_default_CUDA_compiler_flags()); + file_ending = ".cu"; + shims_file = prefix + "_shims.cpp"; + } + else { + cc = util::getFromEnv(target.compiler_env, target.compiler); + cflags = util::getFromEnv("TACO_CFLAGS", + "-O3 -ffast-math -std=c99") + " -shared -fPIC"; + file_ending = ".c"; + shims_file = ""; + } +#if USE_OPENMP + cflags += " -fopenmp"; +#endif + + string cmd = cc + " " + cflags + " " + + prefix + file_ending + " " + shims_file + " " + + "-o " + fullpath + " -lm"; + + // now compile it + int err = system(cmd.data()); + taco_uassert(err == 0) << "Compilation command failed:\n" << cmd + << "\nreturned " << err; + + // use dlsym() to open the compiled library + if (lib_handle) { + dlclose(lib_handle); + } + lib_handle = dlopen(fullpath.data(), RTLD_NOW | RTLD_LOCAL); + taco_uassert(lib_handle) << "Failed to load generated code"; + + return fullpath; +} + void Module::setSource(string source) { this->source << source; moduleFromUserSource = true; diff --git a/src/index_notation/index_notation.cpp b/src/index_notation/index_notation.cpp index 7035fc034..c199e2389 100644 --- a/src/index_notation/index_notation.cpp +++ b/src/index_notation/index_notation.cpp @@ -1035,6 +1035,27 @@ IndexStmt IndexStmt::divide(IndexVar i, IndexVar i1, IndexVar i2, size_t splitFa return *this; } +IndexStmt IndexStmt::precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace, bool shared_mem) const { + if (shared_mem){ + std::cout << "is shared_mem " << __FILE__ << " " << __LINE__ << std::endl; + } + IndexStmt transformed = *this; + string reason; + if (i != iw) { + IndexVarRel rel = IndexVarRel(new PrecomputeRelNode(i, iw)); + transformed = Transformation(AddSuchThatPredicates({rel})).apply(transformed, &reason); + if (!transformed.defined()) { + taco_uerror << reason; + } + } + + transformed = Transformation(Precompute(expr, i, iw, workspace, shared_mem)).apply(transformed, &reason); + if (!transformed.defined()) { + taco_uerror << reason; + } + return transformed; +} + IndexStmt IndexStmt::precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace) const { IndexStmt transformed = *this; string reason; @@ -1486,6 +1507,7 @@ struct TensorVar::Content { Type type; Format format; Schedule schedule; + bool shared_memory = false; }; TensorVar::TensorVar() : content(nullptr) 
{ @@ -1514,6 +1536,18 @@ TensorVar::TensorVar(const string& name, const Type& type, const Format& format) content->format = format; } +TensorVar::TensorVar(const string& name, const Type& type, const Format& format, bool is_shared_mem) + : content(new Content) { + content->name = name; + content->type = type; + content->format = format; + content->shared_memory = is_shared_mem; +} + +bool TensorVar::is_shared_memory() { + return content->shared_memory; +} + std::string TensorVar::getName() const { return content->name; } diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp index 752416955..f1a1eae00 100644 --- a/src/index_notation/transformations.cpp +++ b/src/index_notation/transformations.cpp @@ -133,6 +133,7 @@ struct Precompute::Content { IndexVar i; IndexVar iw; TensorVar workspace; + bool is_shared_mem = false; }; Precompute::Precompute() : content(nullptr) { @@ -144,6 +145,16 @@ Precompute::Precompute(IndexExpr expr, IndexVar i, IndexVar iw, content->i = i; content->iw = iw; content->workspace = workspace; + content->is_shared_mem = false; +} + +Precompute::Precompute(IndexExpr expr, IndexVar i, IndexVar iw, + TensorVar workspace, bool shared_mem) : content(new Content) { + content->expr = expr; + content->i = i; + content->iw = iw; + content->workspace = workspace; + content->is_shared_mem = shared_mem; } IndexExpr Precompute::getExpr() const { @@ -235,11 +246,25 @@ IndexStmt Precompute::apply(IndexStmt stmt, std::string* reason) const { IndexStmt s = foralli.getStmt(); TensorVar ws = precompute.getWorkspace(); IndexExpr e = precompute.getExpr(); + std::cout << __FILE__ << ": " << __LINE__ << "precompute getExpr e = " << e << std::endl; + + if (ws.is_shared_memory()){ + std::cout << __FILE__ << __LINE__ << " workspace is shared memory! 
" << std::endl; + } + IndexVar iw = precompute.getiw(); + std::cout << "precompute getiw iw = " << iw << std::endl; + std::cout << "precompute ws = " << ws << std::endl; + std::cout << "precompute s = " << s << std::endl; IndexStmt consumer = forall(i, replace(s, {{e, ws(i)}})); + std::cout << "precompute consumer = " << consumer << std::endl; + IndexStmt producer = forall(iw, ws(iw) = replace(e, {{i,iw}})); + std::cout << "precompute producer = " << producer << std::endl; + Where where(consumer, producer); + std::cout << "precompute where = " << where << std::endl; stmt = where; return; diff --git a/src/ir/ir.cpp b/src/ir/ir.cpp index dbe941fe6..7656d9b26 100644 --- a/src/ir/ir.cpp +++ b/src/ir/ir.cpp @@ -241,7 +241,7 @@ bool Literal::equalsScalar(double scalar) const { return false; } -Expr Var::make(std::string name, Datatype type, bool is_ptr, bool is_tensor) { +Expr Var::make(std::string name, Datatype type, bool is_ptr, bool is_tensor, bool is_shared_mem) { Var *var = new Var; var->type = type; var->name = name; @@ -249,6 +249,12 @@ Expr Var::make(std::string name, Datatype type, bool is_ptr, bool is_tensor) { // TODO: is_ptr and is_tensor should be part of type var->is_ptr = is_ptr; var->is_tensor = is_tensor; + var->is_shared_memory = is_shared_mem; + + if (var->is_shared_memory) + { + std::cout << var->name << " is shared memory" << __FILE__ << __LINE__ << std::endl; + } return var; } @@ -644,6 +650,10 @@ Stmt For::make(Expr var, Expr start, Expr end, Expr increment, Stmt body, LoopKind kind, ParallelUnit parallel_unit, size_t unrollFactor, int vec_width) { For *loop = new For; loop->var = var; + if (var.as() && var.as()->name == "pprecomputed"){ + std::cout << " vAr is named Pprecomputed " << std::endl; + } + loop->start = start; loop->end = end; loop->increment = increment; @@ -753,6 +763,13 @@ Stmt Allocate::make(Expr var, Expr num_elements, bool is_realloc, Expr old_eleme "Can only allocate memory for a pointer-typed Var"; taco_iassert(num_elements.type().isInt() || num_elements.type().isUInt()) << "Can only allocate an integer-valued number of elements"; + + if (var.as() && var.as()->is_shared_memory){ + std::cout << " vAr is Marked as shaRed memory, nice! 
" << std::endl; + } + if (var.as() && var.as()->name == "precomputed"){ + std::cout << " vAr is named precomputed " << std::endl; + } Allocate* alloc = new Allocate; alloc->var = var; alloc->num_elements = num_elements; diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp index acc1b11bf..32ee42f07 100644 --- a/src/lower/lowerer_impl.cpp +++ b/src/lower/lowerer_impl.cpp @@ -107,6 +107,7 @@ static bool hasStores(Stmt stmt) { Stmt LowererImpl::lower(IndexStmt stmt, string name, bool assemble, bool compute) { + std::cout << "file, line, function " << __FILE__ << " " << __LINE__ << " lower\n"; this->assemble = assemble; this->compute = compute; definedIndexVarsOrdered = {}; @@ -126,9 +127,21 @@ LowererImpl::lower(IndexStmt stmt, string name, bool assemble, bool compute) // Create variables for temporaries // TODO Remove this for (auto& temp : temporaries) { - ir::Expr irVar = ir::Var::make(temp.getName(), temp.getType().getDataType(), + std::cout << __FILE__ << " " << __LINE__ << " temp: " << temp << std::endl; + if (((TensorVar)(temp)).is_shared_memory()){ + std::cout << __FILE__ << " " << __LINE__ << "TENSOR VAR IS SHARED MEMORY" << std::endl; + ir::Expr irVar = ir::Var::make(temp.getName(), temp.getType().getDataType(), + true, true, true); + tensorVars.insert({temp, irVar}); + } + else{ + ir::Expr irVar = ir::Var::make(temp.getName(), temp.getType().getDataType(), true, true); - tensorVars.insert({temp, irVar}); + std::cout << __FILE__ << " " << __LINE__ << "TENSOR VAR IS **NOT** SHARED MEMORY" << std::endl; + tensorVars.insert({temp, irVar}); + } + + } // Create variables for keeping track of result values array capacity @@ -1278,14 +1291,34 @@ Stmt LowererImpl::lowerWhere(Where where) { } else { if (generateComputeCode()) { - Expr values = ir::Var::make(temporary.getName(), + Expr values; + if (temporary.is_shared_memory()){ + std::cout << "~~~~~~ at lower where " << temporary.getName() << " is marked as shared. " << std::endl; + values = ir::Var::make(temporary.getName(), + temporary.getType().getDataType(), + true, false, true); + } + else + { + std::cout << "~~~~~~ at lower where " << temporary.getName() << " is **NOT** marked as shared. " << std::endl; + + values = ir::Var::make(temporary.getName(), temporary.getType().getDataType(), true, false); + } + + taco_iassert(temporary.getType().getOrder() == 1); // TODO Dimension temporarySize = temporary.getType().getShape().getDimension(0); Expr size; if (temporarySize.isFixed()) { - size = ir::Literal::make(temporarySize.getSize()); + // if (temporary.is_shared_memory()){ + size = ir::Literal::make(temporarySize.getSize()); + // } + // else{ + // size = ir::Literal::make(temporarySize.getSize()); + // } + // size = ir::Literal::make(temporarySize.getSize()); } else if (temporarySize.isIndexVarSized()) { IndexVar var = temporarySize.getIndexVarSize(); @@ -1297,6 +1330,7 @@ Stmt LowererImpl::lowerWhere(Where where) { } // no decl needed for shared memory + // ???? 
Stmt decl = Stmt(); if((isa(where.getProducer()) && inParallelLoopDepth == 0) || !should_use_CUDA_codegen()) { decl = VarDecl::make(values, ir::Literal::make(0)); @@ -1305,6 +1339,9 @@ Stmt LowererImpl::lowerWhere(Where where) { Expr p = Var::make("p" + temporary.getName(), Int()); Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType())); + + // if shared mem, do something + Stmt zeroInitLoop = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); freeTemporary = Free::make(values); diff --git a/src/tensor.cpp b/src/tensor.cpp index 5efba93cc..7787d54eb 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -408,6 +408,10 @@ void TensorBase::compile(taco::IndexStmt stmt, bool assembleWhileCompute) { content->module->compile(); } +void TensorBase::recompile(std::string file_path) { + content->module->recompile(file_path); +} + taco_tensor_t* TensorBase::getTacoTensorT() { return getStorage(); } @@ -473,6 +477,17 @@ void TensorBase::assemble() { } } +void TensorBase::reassemble() { + + auto arguments = packArguments(*this); + content->module->callFuncPacked("assemble", arguments.data()); + + if (!content->assembleWhileCompute) { + taco_tensor_t* tensorData = ((taco_tensor_t*)arguments[0]); + content->valuesSize = unpackTensorData(*tensorData, *this); + } +} + void TensorBase::compute() { taco_uassert(this->content->computeFunc.defined()) << error::compute_without_compile; @@ -486,6 +501,17 @@ void TensorBase::compute() { } } +void TensorBase::recompute() { + + auto arguments = packArguments(*this); + this->content->module->callFuncPacked("compute", arguments.data()); + + if (content->assembleWhileCompute) { + taco_tensor_t* tensorData = ((taco_tensor_t*)arguments[0]); + content->valuesSize = unpackTensorData(*tensorData, *this); + } +} + void TensorBase::evaluate() { this->compile(); if (!getAssignment().getOperator().defined()) { diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index 5be64a4ff..0741e6542 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -129,6 +129,25 @@ IndexStmt scheduleSpMVGPU(IndexStmt stmt, Tensor A, IndexExpr precompute .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics); } +IndexStmt scheduleSpMVGPU_wsp(IndexStmt stmt, Tensor A, IndexExpr precomputedExpr, int NNZ_PER_THREAD=8, int BLOCK_SIZE=256) { + int NNZ_PER_WARP = NNZ_PER_THREAD * WARP_SIZE; + int NNZ_PER_TB = NNZ_PER_THREAD * BLOCK_SIZE; + IndexVar f("f"), fpos("fpos"), fpos1("fpos1"), fpos2("fpos2"), block("block"), warp("warp"), thread("thread"), thread_nz("thread_nz"), thread_nz_pre("thread_nz_pre"); + TensorVar precomputed("precomputed", Type(Float64, {Dimension(thread_nz)}), taco::dense, true); + return stmt.fuse(i, j, f) + .pos(f, fpos, A(i, j)) + .split(fpos, block, fpos1, NNZ_PER_TB) + .split(fpos1, warp, fpos2, NNZ_PER_WARP) + .split(fpos2, thread, thread_nz, NNZ_PER_THREAD) + .reorder({block, warp, thread, thread_nz}) + .precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed, true) + // .precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed) + .unroll(thread_nz_pre, NNZ_PER_THREAD) + .parallelize(block, ParallelUnit::GPUBlock, OutputRaceStrategy::IgnoreRaces) + .parallelize(warp, ParallelUnit::GPUWarp, OutputRaceStrategy::IgnoreRaces) + .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics); +} + IndexStmt scheduleSpMVRowsGPU(IndexStmt stmt, Tensor A, IndexExpr precomputedExpr, int ROWS_PER_WARP=1, int BLOCK_SIZE=256) { int ROWS_PER_TB = 
ROWS_PER_WARP * BLOCK_SIZE; IndexVar block("block"), warp("warp"), thread("thread"), thread_nz("thread_nz"), i1("i1"), jpos("jpos"), block_row("block_row"), warp_row("warp_row"); @@ -188,6 +207,27 @@ IndexStmt scheduleSpMMGPU(IndexStmt stmt, Tensor A, IndexExpr precompute .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics); } +IndexStmt scheduleSpMMGPU_wsp(IndexStmt stmt, Tensor A, IndexExpr precomputedExpr, int NNZ_PER_WARP=8, int BLOCK_SIZE=256) { + int NNZ_PER_TB = NNZ_PER_WARP * (BLOCK_SIZE / WARP_SIZE); + IndexVar f("f"), fpos("fpos"), block("block"), fpos1("fpos1"), warp("warp"), nnz("nnz"), nnz_pre("nnz_pre"); + IndexVar dense_val_unbounded("dense_val_unbounded"), dense_val("dense_val"), thread("thread"); + IndexVar thread_nz("thread_nz"); + TensorVar precomputed("precomputed", Type(Float64, {Dimension(nnz)}), taco::dense); + return stmt.reorder({i, j, k}) + .fuse(i, j, f) + .pos(f, fpos, A(i, j)) + .split(fpos, block, fpos1, NNZ_PER_TB) + .split(fpos1, warp, nnz, NNZ_PER_WARP) + .split(k, dense_val_unbounded, thread, WARP_SIZE) + .reorder({block, warp, thread, dense_val_unbounded, nnz}) + //.precompute(precomputedExpr, nnz, nnz, precomputed) + .bound(dense_val_unbounded, dense_val, 4, BoundType::MaxExact) + //.unroll(dense_val, 4) + .parallelize(block, ParallelUnit::GPUBlock, OutputRaceStrategy::IgnoreRaces) + .parallelize(warp, ParallelUnit::GPUWarp, OutputRaceStrategy::IgnoreRaces) + .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics); +} + IndexStmt scheduleSDDMMGPU(IndexStmt stmt, Tensor B, int NNZ_PER_WARP=8*32, int BLOCK_SIZE=256, int CO_FACTOR=4) { int NNZ_PER_TB = NNZ_PER_WARP * (BLOCK_SIZE / WARP_SIZE); IndexVar f("f"), fpos("fpos"), block("block"), fpos1("fpos1"), warp("warp"), nnz("nnz"); @@ -797,6 +837,124 @@ TEST(scheduling_eval, spmvGPU) { ASSERT_TENSOR_EQ(expected, y); } +TEST(scheduling_eval, smpreGPU) { + if (!should_use_CUDA_codegen()) { + std::cout << "not using cuda" << std::endl; + return; + } + int NUM_I = 425; + int NUM_J = 425; + float SPARSITY = .19; + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor x("x", {NUM_J}, {Dense}); + Tensor y("y", {NUM_I}, {Dense}); + + srand(94353); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + A.insert({i, j}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + x.insert({j}, (double)rand_float); + } + + x.pack(); + A.pack(); + IndexExpr precomputed = A(i, j) * x(j); + y(i) = precomputed; + // std::cout << y(i) << std::endl; + + IndexStmt stmt = y.getAssignment().concretize(); + // stmt = scheduleSpMVGPU_wsp(stmt, A, precomputed); + stmt = scheduleSpMVGPU_wsp(stmt, A, precomputed); + std::cout << "stmt:" << std::endl; + std::cout << stmt << std::endl; + + //printToFile("spmv_gpu", stmt); + + // y.compile(stmt); + // Just overload th compile + + y.compile(stmt); + y.assemble(); + y.compute(); + + // y.recompile("/tmp/taco_tmp_Ryt3Xk/vdhf888td940"); + // y.reassemble(); + // y.recompute(); + + Tensor expected("expected", {NUM_I}, {Dense}); + expected(i) = A(i, j) * x(j); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, y); +} + +TEST(scheduling_eval, spmmGPU_wsp) { + + if (!should_use_CUDA_codegen()) { + std::cout << "not using cuda" << std::endl; + return; + } + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 128; + float 
SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor B("B", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor C("C", {NUM_I, NUM_K}, Format({{Dense, Dense}, {1, 0}})); + + srand(4343211); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + A.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + B.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + A.pack(); + B.pack(); + IndexExpr precomputed = A(i, j); + C(i, k) = B(j, k) * precomputed; + + IndexStmt stmt = C.getAssignment().concretize(); + stmt = scheduleSpMMGPU_wsp(stmt, A, precomputed); + + //printToFile("spmm_gpu", stmt); + + C.compile(stmt); + C.assemble(); + C.compute(); + + // C.recompile("/tmp/taco_tmp_HT60Vh/vdhf888td940"); + // C.reassemble(); + // C.recompute(); + + Tensor expected("expected", {NUM_I, NUM_K}, Format({{Dense, Dense}, {1, 0}})); + expected(i, k) = A(i, j) * B(j, k); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, C); +} + + TEST(scheduling_eval, spmmGPU) { if (!should_use_CUDA_codegen()) { return; From 83169e12f3dfefd82ba921381af78401d126ebfa Mon Sep 17 00:00:00 2001 From: Amalee Wilson Date: Sun, 21 Jun 2020 22:46:26 -0700 Subject: [PATCH 2/4] First pass to add support for dense shared memory workspaces for GPUs --- include/taco/codegen/module.h | 5 +- include/taco/index_notation/index_notation.h | 10 +- include/taco/index_notation/transformations.h | 3 +- include/taco/ir/ir.h | 4 +- include/taco/ir_tags.h | 6 + src/codegen/codegen_cuda.cpp | 233 ++++++------------ src/index_notation/index_notation.cpp | 42 +--- src/index_notation/transformations.cpp | 28 +-- src/ir/ir.cpp | 20 +- src/ir_tags.cpp | 1 + src/lower/lowerer_impl.cpp | 26 +- test/tests-scheduling-eval.cpp | 172 ++++++------- 12 files changed, 193 insertions(+), 357 deletions(-) diff --git a/include/taco/codegen/module.h b/include/taco/codegen/module.h index 4b6aa6f37..3c26cc9f1 100644 --- a/include/taco/codegen/module.h +++ b/include/taco/codegen/module.h @@ -26,7 +26,10 @@ class Module { /// Compile the source into a library, returning its full path std::string compile(); - // Recompile + /// Recompile. This is a debugging tool that, given the path to the temporary + /// file generated by taco, will compile the file. This function is useful + /// for development and facilitates experimentation with generated code by + /// allowing developers to modify the generated code and compile it again. 
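+  /// Example (the path is illustrative; taco writes its sources to a temporary
+  /// directory): module->recompile("/tmp/taco_tmp_XXXXXX/prefix") compiles
+  /// prefix.cu (or prefix.c) and reloads the rebuilt prefix.so.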
   std::string recompile(std::string file_path);
 
   /// Compile the module into a source file located at the specified location
diff --git a/include/taco/index_notation/index_notation.h b/include/taco/index_notation/index_notation.h
index 153f093f2..096ac6756 100644
--- a/include/taco/index_notation/index_notation.h
+++ b/include/taco/index_notation/index_notation.h
@@ -622,8 +622,7 @@ class IndexStmt : public util::IntrusivePtr<const IndexStmtNode> {
   /// The precompute transformation is described in kjolstad2019
   /// allows us to leverage scratchpad memories and
   /// reorder computations to increase locality
-  IndexStmt precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace) const;
-  IndexStmt precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace, bool shared_mem) const;
+  IndexStmt precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace, GPUWorkspace gpuworkspace=GPUWorkspace::None) const;
 
   /// bound specifies a compile-time constraint on an index variable's
   /// iteration space that allows knowledge of the
@@ -850,10 +849,11 @@ class TensorVar : public util::Comparable<TensorVar> {
   TensorVar(const Type& type);
   TensorVar(const std::string& name, const Type& type);
   TensorVar(const Type& type, const Format& format);
-  TensorVar(const std::string& name, const Type& type, const Format& format);
-  TensorVar(const std::string& name, const Type& type, const Format& format, bool is_shared_mem);
+  // TensorVar(const std::string& name, const Type& type, const Format& format);
+  TensorVar(const std::string& name, const Type& type, const Format& format, GPUWorkspace gpuworkspace=GPUWorkspace::None);
 
-  bool is_shared_memory();
+  /// Returns the kind of GPU workspace this TensorVar uses (None by default).
+  GPUWorkspace getGPUWorkspace();
 
   /// Returns the name of the tensor variable.
   std::string getName() const;
diff --git a/include/taco/index_notation/transformations.h b/include/taco/index_notation/transformations.h
index df29b7feb..f91783e56 100644
--- a/include/taco/index_notation/transformations.h
+++ b/include/taco/index_notation/transformations.h
@@ -86,8 +86,7 @@ std::ostream &operator<<(std::ostream &, const Reorder &);
 class Precompute : public TransformationInterface {
 public:
   Precompute();
-  Precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace);
-  Precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace, bool shared_mem);
+  Precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace, GPUWorkspace gpuworkspace=GPUWorkspace::None);
 
   IndexExpr getExpr() const;
   IndexVar geti() const;
diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h
index 56f186d66..2befde4ed 100644
--- a/include/taco/ir/ir.h
+++ b/include/taco/ir/ir.h
@@ -254,10 +254,10 @@ struct Var : public ExprNode<Var> {
   std::string name;
   bool is_ptr;
   bool is_tensor;
-  bool is_shared_memory = false;
+  GPUWorkspace gpuworkspace;
 
   static Expr make(std::string name, Datatype type, bool is_ptr=false,
-                   bool is_tensor=false, bool is_shared_memory=false);
+                   bool is_tensor=false, GPUWorkspace gpuworkspace=GPUWorkspace::None);
 
   static const IRNodeType _type_info = IRNodeType::Var;
 };
diff --git a/include/taco/ir_tags.h b/include/taco/ir_tags.h
index 2b1a4a4f1..39ed25eb4 100644
--- a/include/taco/ir_tags.h
+++ b/include/taco/ir_tags.h
@@ -29,4 +29,10 @@ enum class BoundType {
 extern const char *BoundType_NAMES[];
 }
 
+/* TODO: Not sure if this is the right place for these.
*/ +enum class GPUWorkspace { + None, DenseSharedMemory +}; +extern const char *GPUWorkspace_NAMES[]; + #endif //TACO_IR_TAGS_H diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp index b7e59f691..a08f89d7a 100644 --- a/src/codegen/codegen_cuda.cpp +++ b/src/codegen/codegen_cuda.cpp @@ -361,16 +361,6 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor { threadFors.push_back(op); threadIDVars.push_back(pair(scopeMap[op->var], op->var)); - // if (isa(op->var)){ - // std::cout << "~~~~~ Found a var! " << to(op->var)->name << std::endl; - // if (to(op->var)->is_shared_memory) - // { - // std::cout << "~~~~~ THIS VAR IS MARKED AS SHARED MEMORY" << std::endl; - - // // string elementType = printCUDAType(op->var.type(), false); - // // sizeSharedMemory.push_back(Mul::make(to()->num_elements, Literal::make(256))); - // } - // } Expr blockSize = ir::simplify(ir::Div::make(ir::Sub::make(op->end, op->start), op->increment)); numThreads.push_back(blockSize); @@ -393,13 +383,9 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor { virtual void visit(const Var *op) { if (isa(op)){ - // std::cout << "~~~~~ Found a var! " << to(op)->name << std::endl; - if (to(op)->is_shared_memory) + if (to(op)->gpuworkspace == GPUWorkspace::DenseSharedMemory) { - std::cout << "~~~~~ THIS VAR IS MARKED AS SHARED MEMORY" << std::endl; - // std::cout << to(op) << std::endl; string elementType = printCUDAType( op->type, false); - std::cout << "element type : " << elementType << std::endl; typeSharedMemory = elementType; // sizeSharedMemory.push_back(Mul::make(to()->num_elements, Literal::make(256))); } @@ -422,20 +408,6 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor { } virtual void visit(const VarDecl *op) { - - // if (isa(op->var)){ - // std::cout << "@@@@ Found a var decl! " << to(op->var)->name << std::endl; - // if (to(op->var)->is_shared_memory) - // { - // std::cout << "@@@@@@@ THIS VAR IS MARKED AS SHARED MEMORY" << std::endl; - // // std::cout << to(op) << std::endl; - // // string elementType = printCUDAType( op->type, false); - // // std::cout << "element type : " << elementType << std::endl; - // // typeSharedMemory = elementType; - // // sizeSharedMemory.push_back(Mul::make(to()->num_elements, Literal::make(256))); - // } - // } - if (inDeviceFunction) { variablesDeclaredInKernel.insert(op->var); } @@ -1082,150 +1054,101 @@ void CodeGen_CUDA::visit(const Max* op) { void CodeGen_CUDA::visit(const Allocate* op) { string elementType = printCUDAType(op->var.type(), false); - // std::cout << op->var << std::endl; - if (isa(op->var)){ - if (to(op->var)->name == "precomputed") - { - std::cout << " found precomputed!! 
" << std::endl; - // std::cout << "var : " << to(op->var) << std::endl; - // std::cout << "*var : " << *(to(op->var)) << std::endl; - // stream << "__shared__" << " "; - } - if (to(op->var)->is_shared_memory) - { - std::cout << "HEY IT IS MARKED AS SHARED MEMORY" << std::endl; - // doIndent(); - // stream << "__shared__ "; - } - else - { - std::cout << "NOT marked as shared memory" << std::endl; - } - - // __syncthreads() ??; - } - if (!isHostFunction) { - if (to(op->var)->is_shared_memory) + if (to(op->var)->gpuworkspace == GPUWorkspace::DenseSharedMemory) { - std::cout << "AT visit allocate, " << to(op->var)->name << " is shared mem" << std::endl; -// __shared__ double w_GPUThread[32]; if no warps - // __shared__ double w_GPUThread_ALL[32 * # num warps]; if warps - // double * w_GPUThread = w_GPUThread_ALL + warp_id * 32; - - // Example.. todo: figure this out and put it back - taco_iassert(!op->is_realloc); - doIndent(); - stream << "__shared__ " << elementType << " "; - op->var.accept(this); - if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { - stream << "_ALL"; - } - stream << "["; - if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { - // Expr numElements = Mul::make(op->num_elements, - // Div::make(parallelUnitSizes[ParallelUnit::GPUBlock], - // parallelUnitSizes[ParallelUnit::GPUWarp])); - // Expr numElements = Mul::make(op->num_elements, - // parallelUnitSizes[ParallelUnit::GPUWarp]); - Expr numElements = Mul::make(op->num_elements, - parallelUnitSizes[ParallelUnit::GPUBlock]); - // BIG TODO: remove hard coded 8 - // std::cout << "######warp " << parallelUnitSizes[ParallelUnit::GPUWarp] << std::endl; - // std::cout << "######block " << parallelUnitSizes[ParallelUnit::GPUBlock] << std::endl; - // std::cout << "######threads " << parallelUnitSizes[ParallelUnit::GPUThread] << std::endl; - // std::cout << "######warpred " << parallelUnitSizes[ParallelUnit::GPUWarpReduction] << std::endl; - // std::cout << "######blockred " << parallelUnitSizes[ParallelUnit::GPUBlockReduction] << std::endl; - - // std::pair pr = ; - sizeofshared = numElements; - typeofshared = elementType; - usesshared = true; - // sharedMemTypes.push_back(std::make_pair(op->var, elementType)); - // std::cout << "var type from map: " << sharedMemTypes[0].second << std::endl; - // sharedMemSizes.push_back(std::make_pair(op->var, numElements)); - // std::cout << "var numElem from map: " << sharedMemSizes[0].second << std::endl; - - - - - ir::simplify(numElements).accept(this); - } - else { - std::cout << "AT visit allocate, " << to(op->var)->name << " is *** NOT*** shared mem" << std::endl; + taco_iassert(!op->is_realloc); doIndent(); - stream << elementType << " "; + stream << "__shared__ " << elementType << " "; op->var.accept(this); + if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { + stream << "_ALL"; + } stream << "["; - op->num_elements.accept(this); + if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { + Expr numElements = Mul::make(op->num_elements, + parallelUnitSizes[ParallelUnit::GPUBlock]); + + sizeofshared = numElements; + typeofshared = elementType; + usesshared = true; + + ir::simplify(numElements).accept(this); + } + else { + doIndent(); + stream << elementType << " "; + op->var.accept(this); + stream << "["; + op->num_elements.accept(this); + stream << "];" << endl; + return; + } stream << "];" << endl; + if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { + doIndent(); + stream << elementType << " * "; + op->var.accept(this); + + stream << " = "; + op->var.accept(this); + stream 
<< "_ALL + threadIdx.x"; + // parallelUnitIDVars[ParallelUnit::GPUWarp].accept(this); + stream << " * "; + op->num_elements.accept(this); + // parallelUnitSizes[ParallelUnit::GPUWarp].accept(this); + stream << ";" << endl; + } return; } - stream << "];" << endl; - if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { - doIndent(); - stream << elementType << " * "; - op->var.accept(this); - - stream << " = "; - op->var.accept(this); - stream << "_ALL + threadIdx.x"; - // parallelUnitIDVars[ParallelUnit::GPUWarp].accept(this); - stream << " * "; - op->num_elements.accept(this); - // parallelUnitSizes[ParallelUnit::GPUWarp].accept(this); - stream << ";" << endl; - } - return; - } else { if (parentParallelUnits.count(ParallelUnit::GPUThread)) { - // double w_GPUThread[num]; - // for threads allocate thread local memory + // double w_GPUThread[num]; + // for threads allocate thread local memory + doIndent(); + stream << elementType << " "; + op->var.accept(this); + stream << "["; + op->num_elements.accept(this); + stream << "];" << endl; + return; + } + // __shared__ double w_GPUThread[32]; if no warps + // __shared__ double w_GPUThread_ALL[32 * # num warps]; if warps + // double * w_GPUThread = w_GPUThread_ALL + warp_id * 32; + taco_iassert(!op->is_realloc); doIndent(); - stream << elementType << " "; + stream << "__shared__ " << elementType << " "; op->var.accept(this); + if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { + stream << "_ALL"; + } stream << "["; - op->num_elements.accept(this); + if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { + Expr numElements = Mul::make(op->num_elements, Div::make(parallelUnitSizes[ParallelUnit::GPUBlock], parallelUnitSizes[ParallelUnit::GPUWarp])); + ir::simplify(numElements).accept(this); + } + else { + op->num_elements.accept(this); + } stream << "];" << endl; + if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { + doIndent(); + stream << elementType << " * "; + op->var.accept(this); + + stream << " = "; + op->var.accept(this); + stream << "_ALL + "; + parallelUnitIDVars[ParallelUnit::GPUWarp].accept(this); + stream << " * "; + parallelUnitSizes[ParallelUnit::GPUWarp].accept(this); + stream << ";" << endl; + } return; } - // __shared__ double w_GPUThread[32]; if no warps - // __shared__ double w_GPUThread_ALL[32 * # num warps]; if warps - // double * w_GPUThread = w_GPUThread_ALL + warp_id * 32; - taco_iassert(!op->is_realloc); - doIndent(); - stream << "__shared__ " << elementType << " "; - op->var.accept(this); - if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { - stream << "_ALL"; - } - stream << "["; - if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { - Expr numElements = Mul::make(op->num_elements, Div::make(parallelUnitSizes[ParallelUnit::GPUBlock], parallelUnitSizes[ParallelUnit::GPUWarp])); - ir::simplify(numElements).accept(this); - } - else { - op->num_elements.accept(this); - } - stream << "];" << endl; - if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { - doIndent(); - stream << elementType << " * "; - op->var.accept(this); - - stream << " = "; - op->var.accept(this); - stream << "_ALL + "; - parallelUnitIDVars[ParallelUnit::GPUWarp].accept(this); - stream << " * "; - parallelUnitSizes[ParallelUnit::GPUWarp].accept(this); - stream << ";" << endl; - } - return; - } diff --git a/src/index_notation/index_notation.cpp b/src/index_notation/index_notation.cpp index 48cd8b557..9a5baf677 100644 --- a/src/index_notation/index_notation.cpp +++ b/src/index_notation/index_notation.cpp @@ -1377,10 +1377,7 @@ IndexStmt 
IndexStmt::divide(IndexVar i, IndexVar i1, IndexVar i2, size_t splitFa return *this; } -IndexStmt IndexStmt::precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace, bool shared_mem) const { - if (shared_mem){ - std::cout << "is shared_mem " << __FILE__ << " " << __LINE__ << std::endl; - } +IndexStmt IndexStmt::precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace, GPUWorkspace gpuworkspace) const { IndexStmt transformed = *this; string reason; if (i != iw) { @@ -1391,25 +1388,7 @@ IndexStmt IndexStmt::precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorV } } - transformed = Transformation(Precompute(expr, i, iw, workspace, shared_mem)).apply(transformed, &reason); - if (!transformed.defined()) { - taco_uerror << reason; - } - return transformed; -} - -IndexStmt IndexStmt::precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace) const { - IndexStmt transformed = *this; - string reason; - if (i != iw) { - IndexVarRel rel = IndexVarRel(new PrecomputeRelNode(i, iw)); - transformed = Transformation(AddSuchThatPredicates({rel})).apply(transformed, &reason); - if (!transformed.defined()) { - taco_uerror << reason; - } - } - - transformed = Transformation(Precompute(expr, i, iw, workspace)).apply(transformed, &reason); + transformed = Transformation(Precompute(expr, i, iw, workspace, gpuworkspace)).apply(transformed, &reason); if (!transformed.defined()) { taco_uerror << reason; } @@ -1840,7 +1819,7 @@ struct TensorVar::Content { Type type; Format format; Schedule schedule; - bool shared_memory = false; + GPUWorkspace gpuworkspace; }; TensorVar::TensorVar() : content(nullptr) { @@ -1862,23 +1841,16 @@ TensorVar::TensorVar(const Type& type, const Format& format) : TensorVar(util::uniqueName('A'), type, format) { } -TensorVar::TensorVar(const string& name, const Type& type, const Format& format) - : content(new Content) { - content->name = name; - content->type = type; - content->format = format; -} - -TensorVar::TensorVar(const string& name, const Type& type, const Format& format, bool is_shared_mem) +TensorVar::TensorVar(const string& name, const Type& type, const Format& format, GPUWorkspace gpuworkspace) : content(new Content) { content->name = name; content->type = type; content->format = format; - content->shared_memory = is_shared_mem; + content->gpuworkspace = gpuworkspace; } -bool TensorVar::is_shared_memory() { - return content->shared_memory; +GPUWorkspace TensorVar::getGPUWorkspace() { + return content->gpuworkspace; } std::string TensorVar::getName() const { diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp index f1a1eae00..60f2bf8a0 100644 --- a/src/index_notation/transformations.cpp +++ b/src/index_notation/transformations.cpp @@ -133,28 +133,19 @@ struct Precompute::Content { IndexVar i; IndexVar iw; TensorVar workspace; - bool is_shared_mem = false; + GPUWorkspace gpuworkspace = GPUWorkspace::None; }; Precompute::Precompute() : content(nullptr) { } Precompute::Precompute(IndexExpr expr, IndexVar i, IndexVar iw, - TensorVar workspace) : content(new Content) { + TensorVar workspace, GPUWorkspace gpuworkspace) : content(new Content) { content->expr = expr; content->i = i; content->iw = iw; content->workspace = workspace; - content->is_shared_mem = false; -} - -Precompute::Precompute(IndexExpr expr, IndexVar i, IndexVar iw, - TensorVar workspace, bool shared_mem) : content(new Content) { - content->expr = expr; - content->i = i; - content->iw = iw; - content->workspace = workspace; - 
content->is_shared_mem = shared_mem; + content->gpuworkspace = gpuworkspace; } IndexExpr Precompute::getExpr() const { @@ -246,25 +237,12 @@ IndexStmt Precompute::apply(IndexStmt stmt, std::string* reason) const { IndexStmt s = foralli.getStmt(); TensorVar ws = precompute.getWorkspace(); IndexExpr e = precompute.getExpr(); - std::cout << __FILE__ << ": " << __LINE__ << "precompute getExpr e = " << e << std::endl; - - if (ws.is_shared_memory()){ - std::cout << __FILE__ << __LINE__ << " workspace is shared memory! " << std::endl; - } - IndexVar iw = precompute.getiw(); - std::cout << "precompute getiw iw = " << iw << std::endl; - std::cout << "precompute ws = " << ws << std::endl; - std::cout << "precompute s = " << s << std::endl; IndexStmt consumer = forall(i, replace(s, {{e, ws(i)}})); - std::cout << "precompute consumer = " << consumer << std::endl; - IndexStmt producer = forall(iw, ws(iw) = replace(e, {{i,iw}})); - std::cout << "precompute producer = " << producer << std::endl; Where where(consumer, producer); - std::cout << "precompute where = " << where << std::endl; stmt = where; return; diff --git a/src/ir/ir.cpp b/src/ir/ir.cpp index 7656d9b26..60d7d24f6 100644 --- a/src/ir/ir.cpp +++ b/src/ir/ir.cpp @@ -241,7 +241,7 @@ bool Literal::equalsScalar(double scalar) const { return false; } -Expr Var::make(std::string name, Datatype type, bool is_ptr, bool is_tensor, bool is_shared_mem) { +Expr Var::make(std::string name, Datatype type, bool is_ptr, bool is_tensor, GPUWorkspace gpuworkspace) { Var *var = new Var; var->type = type; var->name = name; @@ -249,12 +249,7 @@ Expr Var::make(std::string name, Datatype type, bool is_ptr, bool is_tensor, boo // TODO: is_ptr and is_tensor should be part of type var->is_ptr = is_ptr; var->is_tensor = is_tensor; - var->is_shared_memory = is_shared_mem; - - if (var->is_shared_memory) - { - std::cout << var->name << " is shared memory" << __FILE__ << __LINE__ << std::endl; - } + var->gpuworkspace = gpuworkspace; return var; } @@ -650,10 +645,6 @@ Stmt For::make(Expr var, Expr start, Expr end, Expr increment, Stmt body, LoopKind kind, ParallelUnit parallel_unit, size_t unrollFactor, int vec_width) { For *loop = new For; loop->var = var; - if (var.as() && var.as()->name == "pprecomputed"){ - std::cout << " vAr is named Pprecomputed " << std::endl; - } - loop->start = start; loop->end = end; loop->increment = increment; @@ -763,13 +754,6 @@ Stmt Allocate::make(Expr var, Expr num_elements, bool is_realloc, Expr old_eleme "Can only allocate memory for a pointer-typed Var"; taco_iassert(num_elements.type().isInt() || num_elements.type().isUInt()) << "Can only allocate an integer-valued number of elements"; - - if (var.as() && var.as()->is_shared_memory){ - std::cout << " vAr is Marked as shaRed memory, nice! 
" << std::endl; - } - if (var.as() && var.as()->name == "precomputed"){ - std::cout << " vAr is named precomputed " << std::endl; - } Allocate* alloc = new Allocate; alloc->var = var; alloc->num_elements = num_elements; diff --git a/src/ir_tags.cpp b/src/ir_tags.cpp index a7155438c..122805776 100644 --- a/src/ir_tags.cpp +++ b/src/ir_tags.cpp @@ -4,4 +4,5 @@ namespace taco { const char *ParallelUnit_NAMES[] = {"NotParallel", "DefaultUnit", "GPUBlock", "GPUWarp", "GPUThread", "CPUThread", "CPUVector", "CPUThreadGroupReduction", "GPUBlockReduction", "GPUWarpReduction"}; const char *OutputRaceStrategy_NAMES[] = {"IgnoreRaces", "NoRaces", "Atomics", "Temporary", "ParallelReduction"}; const char *BoundType_NAMES[] = {"MinExact", "MinConstraint", "MaxExact", "MaxConstraint"}; +const char *GPUWorkspace_NAMES[] = {"None", "DenseSharedMemory"}; } diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp index 32ee42f07..a7bb87f32 100644 --- a/src/lower/lowerer_impl.cpp +++ b/src/lower/lowerer_impl.cpp @@ -107,7 +107,6 @@ static bool hasStores(Stmt stmt) { Stmt LowererImpl::lower(IndexStmt stmt, string name, bool assemble, bool compute) { - std::cout << "file, line, function " << __FILE__ << " " << __LINE__ << " lower\n"; this->assemble = assemble; this->compute = compute; definedIndexVarsOrdered = {}; @@ -127,17 +126,14 @@ LowererImpl::lower(IndexStmt stmt, string name, bool assemble, bool compute) // Create variables for temporaries // TODO Remove this for (auto& temp : temporaries) { - std::cout << __FILE__ << " " << __LINE__ << " temp: " << temp << std::endl; - if (((TensorVar)(temp)).is_shared_memory()){ - std::cout << __FILE__ << " " << __LINE__ << "TENSOR VAR IS SHARED MEMORY" << std::endl; + if (((TensorVar)(temp)).getGPUWorkspace() != GPUWorkspace::None){ ir::Expr irVar = ir::Var::make(temp.getName(), temp.getType().getDataType(), - true, true, true); + true, true, ((TensorVar)(temp)).getGPUWorkspace()); tensorVars.insert({temp, irVar}); } else{ ir::Expr irVar = ir::Var::make(temp.getName(), temp.getType().getDataType(), true, true); - std::cout << __FILE__ << " " << __LINE__ << "TENSOR VAR IS **NOT** SHARED MEMORY" << std::endl; tensorVars.insert({temp, irVar}); } @@ -1292,16 +1288,13 @@ Stmt LowererImpl::lowerWhere(Where where) { else { if (generateComputeCode()) { Expr values; - if (temporary.is_shared_memory()){ - std::cout << "~~~~~~ at lower where " << temporary.getName() << " is marked as shared. " << std::endl; + if (temporary.getGPUWorkspace() != GPUWorkspace::None){ values = ir::Var::make(temporary.getName(), temporary.getType().getDataType(), - true, false, true); + true, false, temporary.getGPUWorkspace()); } else { - std::cout << "~~~~~~ at lower where " << temporary.getName() << " is **NOT** marked as shared. " << std::endl; - values = ir::Var::make(temporary.getName(), temporary.getType().getDataType(), true, false); @@ -1312,13 +1305,7 @@ Stmt LowererImpl::lowerWhere(Where where) { Dimension temporarySize = temporary.getType().getShape().getDimension(0); Expr size; if (temporarySize.isFixed()) { - // if (temporary.is_shared_memory()){ size = ir::Literal::make(temporarySize.getSize()); - // } - // else{ - // size = ir::Literal::make(temporarySize.getSize()); - // } - // size = ir::Literal::make(temporarySize.getSize()); } else if (temporarySize.isIndexVarSized()) { IndexVar var = temporarySize.getIndexVarSize(); @@ -1330,7 +1317,6 @@ Stmt LowererImpl::lowerWhere(Where where) { } // no decl needed for shared memory - // ???? 
Stmt decl = Stmt(); if((isa(where.getProducer()) && inParallelLoopDepth == 0) || !should_use_CUDA_codegen()) { decl = VarDecl::make(values, ir::Literal::make(0)); @@ -1340,8 +1326,8 @@ Stmt LowererImpl::lowerWhere(Where where) { Expr p = Var::make("p" + temporary.getName(), Int()); Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType())); - // if shared mem, do something - + // TODO: Should this zero init loop even exist for precompute for dense shared + // memory workspaces on GPUs? Stmt zeroInitLoop = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); freeTemporary = Free::make(values); diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index 0741e6542..de7e648f7 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -129,19 +129,18 @@ IndexStmt scheduleSpMVGPU(IndexStmt stmt, Tensor A, IndexExpr precompute .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics); } -IndexStmt scheduleSpMVGPU_wsp(IndexStmt stmt, Tensor A, IndexExpr precomputedExpr, int NNZ_PER_THREAD=8, int BLOCK_SIZE=256) { +IndexStmt scheduleSpMVGPU_dsm(IndexStmt stmt, Tensor A, IndexExpr precomputedExpr, int NNZ_PER_THREAD=8, int BLOCK_SIZE=256) { int NNZ_PER_WARP = NNZ_PER_THREAD * WARP_SIZE; int NNZ_PER_TB = NNZ_PER_THREAD * BLOCK_SIZE; IndexVar f("f"), fpos("fpos"), fpos1("fpos1"), fpos2("fpos2"), block("block"), warp("warp"), thread("thread"), thread_nz("thread_nz"), thread_nz_pre("thread_nz_pre"); - TensorVar precomputed("precomputed", Type(Float64, {Dimension(thread_nz)}), taco::dense, true); + TensorVar precomputed("precomputed", Type(Float64, {Dimension(thread_nz)}), taco::dense, GPUWorkspace::DenseSharedMemory); return stmt.fuse(i, j, f) .pos(f, fpos, A(i, j)) .split(fpos, block, fpos1, NNZ_PER_TB) .split(fpos1, warp, fpos2, NNZ_PER_WARP) .split(fpos2, thread, thread_nz, NNZ_PER_THREAD) .reorder({block, warp, thread, thread_nz}) - .precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed, true) - // .precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed) + .precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed, GPUWorkspace::DenseSharedMemory) .unroll(thread_nz_pre, NNZ_PER_THREAD) .parallelize(block, ParallelUnit::GPUBlock, OutputRaceStrategy::IgnoreRaces) .parallelize(warp, ParallelUnit::GPUWarp, OutputRaceStrategy::IgnoreRaces) @@ -207,27 +206,6 @@ IndexStmt scheduleSpMMGPU(IndexStmt stmt, Tensor A, IndexExpr precompute .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics); } -IndexStmt scheduleSpMMGPU_wsp(IndexStmt stmt, Tensor A, IndexExpr precomputedExpr, int NNZ_PER_WARP=8, int BLOCK_SIZE=256) { - int NNZ_PER_TB = NNZ_PER_WARP * (BLOCK_SIZE / WARP_SIZE); - IndexVar f("f"), fpos("fpos"), block("block"), fpos1("fpos1"), warp("warp"), nnz("nnz"), nnz_pre("nnz_pre"); - IndexVar dense_val_unbounded("dense_val_unbounded"), dense_val("dense_val"), thread("thread"); - IndexVar thread_nz("thread_nz"); - TensorVar precomputed("precomputed", Type(Float64, {Dimension(nnz)}), taco::dense); - return stmt.reorder({i, j, k}) - .fuse(i, j, f) - .pos(f, fpos, A(i, j)) - .split(fpos, block, fpos1, NNZ_PER_TB) - .split(fpos1, warp, nnz, NNZ_PER_WARP) - .split(k, dense_val_unbounded, thread, WARP_SIZE) - .reorder({block, warp, thread, dense_val_unbounded, nnz}) - //.precompute(precomputedExpr, nnz, nnz, precomputed) - .bound(dense_val_unbounded, dense_val, 4, BoundType::MaxExact) - //.unroll(dense_val, 4) - .parallelize(block, ParallelUnit::GPUBlock, 
-          .parallelize(warp, ParallelUnit::GPUWarp, OutputRaceStrategy::IgnoreRaces)
-          .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics);
-}
-
 IndexStmt scheduleSDDMMGPU(IndexStmt stmt, Tensor<double> B, int NNZ_PER_WARP=8*32, int BLOCK_SIZE=256, int CO_FACTOR=4) {
   int NNZ_PER_TB = NNZ_PER_WARP * (BLOCK_SIZE / WARP_SIZE);
   IndexVar f("f"), fpos("fpos"), block("block"), fpos1("fpos1"), warp("warp"), nnz("nnz");
@@ -285,6 +263,25 @@ IndexStmt scheduleTTVGPU(IndexStmt stmt, Tensor<double> B, IndexExpr precomputed
       .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics);
 }
 
+IndexStmt scheduleTTVGPU_dsm(IndexStmt stmt, Tensor<double> B, IndexExpr precomputedExpr, int NNZ_PER_WARP=8*32, int BLOCK_SIZE=256) {
+  int NNZ_PER_TB = NNZ_PER_WARP * (BLOCK_SIZE / WARP_SIZE);
+  IndexVar jk("jk"), f("f"), fpos("fpos"), block("block"), fpos1("fpos1"), warp("warp"), fpos2("fpos2"), thread("thread"), thread_nz("thread_nz"), thread_nz_pre("thread_nz_pre");
+  TensorVar precomputed("precomputed", Type(Float64, {Dimension(thread_nz)}), taco::dense, GPUWorkspace::DenseSharedMemory);
+
+  return stmt.fuse(j, k, jk)
+      .fuse(i, jk, f)
+      .pos(f, fpos, B(i,j,k))
+      .split(fpos, block, fpos1, NNZ_PER_TB)
+      .split(fpos1, warp, fpos2, NNZ_PER_WARP)
+      .split(fpos2, thread, thread_nz, NNZ_PER_WARP/WARP_SIZE)
+      .reorder({block, warp, thread, thread_nz})
+      .precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed, GPUWorkspace::DenseSharedMemory)
+      .unroll(thread_nz_pre, NNZ_PER_WARP/WARP_SIZE)
+      .parallelize(block, ParallelUnit::GPUBlock, OutputRaceStrategy::IgnoreRaces)
+      .parallelize(warp, ParallelUnit::GPUWarp, OutputRaceStrategy::IgnoreRaces)
+      .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics);
+}
+
 IndexStmt scheduleMTTKRPGPU(IndexStmt stmt, Tensor<double> B, int NNZ_PER_WARP=16, int BLOCK_SIZE=256) {
   int NNZ_PER_TB = NNZ_PER_WARP * (BLOCK_SIZE / WARP_SIZE);
   IndexVar kl("kl"), f("f"), fpos("fpos"), block("block"), fpos1("fpos1"), warp("warp"), nnz("nnz"), dense_val_unbounded("dense_val_unbounded"), dense_val("dense_val"), thread("thread");
@@ -837,9 +834,8 @@ TEST(scheduling_eval, spmvGPU) {
   ASSERT_TENSOR_EQ(expected, y);
 }
 
-TEST(scheduling_eval, smpreGPU) {
+TEST(scheduling_eval, spmvGPU_dsm) {
   if (!should_use_CUDA_codegen()) {
-    std::cout << "not using cuda" << std::endl;
     return;
   }
   int NUM_I = 425;
@@ -868,24 +864,18 @@ TEST(scheduling_eval, smpreGPU) {
   A.pack();
   IndexExpr precomputed = A(i, j) * x(j);
   y(i) = precomputed;
-  // std::cout << y(i) << std::endl;
 
   IndexStmt stmt = y.getAssignment().concretize();
-  // stmt = scheduleSpMVGPU_wsp(stmt, A, precomputed);
-  stmt = scheduleSpMVGPU_wsp(stmt, A, precomputed);
-  std::cout << "stmt:" << std::endl;
-  std::cout << stmt << std::endl;
-
+  stmt = scheduleSpMVGPU_dsm(stmt, A, precomputed);
 
   //printToFile("spmv_gpu", stmt);
 
-  // y.compile(stmt);
-  // Just overload th compile
   y.compile(stmt);
   y.assemble();
   y.compute();
 
-  // y.recompile("/tmp/taco_tmp_Ryt3Xk/vdhf888td940");
+  // Example of using "recompile" to debug
+  // y.recompile("/tmp/taco_tmp_88888/xxxxxxx");
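+  // (Assumed behavior: recompile() rebuilds the kernel library from an
+  // already-generated source file at the given path; reassemble()/recompute()
+  // then rerun it. Useful for hand-editing the emitted CUDA; the path above
+  // is only a placeholder.)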
   // y.reassemble();
   // y.recompute();
 
@@ -897,64 +887,6 @@ TEST(scheduling_eval, smpreGPU) {
   ASSERT_TENSOR_EQ(expected, y);
 }
 
-TEST(scheduling_eval, spmmGPU_wsp) {
-
-  if (!should_use_CUDA_codegen()) {
-    std::cout << "not using cuda" << std::endl;
-    return;
-  }
-  int NUM_I = 1021/10;
-  int NUM_J = 1039/10;
-  int NUM_K = 128;
-  float SPARSITY = .3;
-  Tensor<double> A("A", {NUM_I, NUM_J}, CSR);
-  Tensor<double> B("B", {NUM_J, NUM_K}, {Dense, Dense});
-  Tensor<double> C("C", {NUM_I, NUM_K}, Format({{Dense, Dense}, {1, 0}}));
-
-  srand(4343211);
-  for (int i = 0; i < NUM_I; i++) {
-    for (int j = 0; j < NUM_J; j++) {
-      float rand_float = (float)rand()/(float)(RAND_MAX);
-      if (rand_float < SPARSITY) {
-        A.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY)));
-      }
-    }
-  }
-
-  for (int j = 0; j < NUM_J; j++) {
-    for (int k = 0; k < NUM_K; k++) {
-      float rand_float = (float)rand()/(float)(RAND_MAX);
-      B.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY)));
-    }
-  }
-
-  A.pack();
-  B.pack();
-  IndexExpr precomputed = A(i, j);
-  C(i, k) = B(j, k) * precomputed;
-
-  IndexStmt stmt = C.getAssignment().concretize();
-  stmt = scheduleSpMMGPU_wsp(stmt, A, precomputed);
-
-  //printToFile("spmm_gpu", stmt);
-
-  C.compile(stmt);
-  C.assemble();
-  C.compute();
-
-  // C.recompile("/tmp/taco_tmp_HT60Vh/vdhf888td940");
-  // C.reassemble();
-  // C.recompute();
-
-  Tensor<double> expected("expected", {NUM_I, NUM_K}, Format({{Dense, Dense}, {1, 0}}));
-  expected(i, k) = A(i, j) * B(j, k);
-  expected.compile();
-  expected.assemble();
-  expected.compute();
-  ASSERT_TENSOR_EQ(expected, C);
-}
-
 TEST(scheduling_eval, spmmGPU) {
   if (!should_use_CUDA_codegen()) {
     return;
@@ -1223,6 +1155,58 @@ TEST(scheduling_eval, ttvGPU) {
   ASSERT_TENSOR_EQ(expected, A);
 }
 
+TEST(scheduling_eval, ttvGPU_dsm) {
+  if (!should_use_CUDA_codegen()) {
+    return;
+  }
+  int NUM_I = 1021/10;
+  int NUM_J = 1039/10;
+  int NUM_K = 1057/10;
+  float SPARSITY = .3;
+  Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs
+  Tensor<double> B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse});
+  Tensor<double> c("c", {NUM_K}, {Dense});
+
+  srand(353252);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      for (int k = 0; k < NUM_K; k++) {
+        float rand_float = (float) rand() / (float) (RAND_MAX);
+        if (rand_float < SPARSITY) {
+          B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY)));
+        }
+      }
+    }
+  }
+
+  for (int k = 0; k < NUM_K; k++) {
+    float rand_float = (float)rand()/(float)(RAND_MAX);
+    c.insert({k}, (double) ((int) (rand_float*3)));
+  }
+
+  B.pack();
+  c.pack();
+
+  IndexExpr precomputedExpr = B(i,j,k) * c(k);
+  A(i,j) = precomputedExpr;
+
+  IndexStmt stmt = A.getAssignment().concretize();
+  stmt = scheduleTTVGPU_dsm(stmt, B, precomputedExpr);
+
+  //printToFile("ttv_gpu", stmt);
+
+  A.compile(stmt);
+  A.assemble();
+  A.compute();
+
+  Tensor<double> expected("expected", {NUM_I, NUM_J}, {Dense, Dense});
+  expected(i,j) = B(i,j,k) * c(k);
+  expected.compile();
+  expected.assemble();
+  expected.compute();
+  ASSERT_TENSOR_EQ(expected, A);
+}
+
 TEST(scheduling_eval, mttkrpGPU) {
   if (!should_use_CUDA_codegen()) {
     return;
From fc9c191c02286329bcba63e92b60b8cec8795229 Mon Sep 17 00:00:00 2001
From: Amalee Wilson
Date: Sun, 21 Jun 2020 23:07:42 -0700
Subject: [PATCH 3/4] Remove redundant GPU workspace specification
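
Before this change a schedule had to name the GPU workspace twice, once
on the TensorVar and again in the precompute() call (see the test diffs
below):

    TensorVar precomputed("precomputed", Type(Float64, {Dimension(thread_nz)}),
                          taco::dense, GPUWorkspace::DenseSharedMemory);
    stmt = stmt.precompute(precomputedExpr, thread_nz, thread_nz_pre,
                           precomputed, GPUWorkspace::DenseSharedMemory);

The TensorVar alone now carries the workspace kind, so the call shrinks to:

    stmt = stmt.precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed);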
---
 include/taco/index_notation/index_notation.h | 2 +-
 src/index_notation/index_notation.cpp        | 4 ++--
 test/tests-scheduling-eval.cpp               | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/taco/index_notation/index_notation.h b/include/taco/index_notation/index_notation.h
index 096ac6756..f4a42edc3 100644
--- a/include/taco/index_notation/index_notation.h
+++ b/include/taco/index_notation/index_notation.h
@@ -622,7 +622,7 @@ class IndexStmt : public util::IntrusivePtr<const IndexStmtNode> {
   /// The precompute transformation is described in kjolstad2019
   /// allows us to leverage scratchpad memories and
   /// reorder computations to increase locality
-  IndexStmt precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace, GPUWorkspace gpuworkspace=GPUWorkspace::None) const;
+  IndexStmt precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace) const;
 
   /// bound specifies a compile-time constraint on an index variable's
   /// iteration space that allows knowledge of the
diff --git a/src/index_notation/index_notation.cpp b/src/index_notation/index_notation.cpp
index 9a5baf677..7b9c10bf5 100644
--- a/src/index_notation/index_notation.cpp
+++ b/src/index_notation/index_notation.cpp
@@ -1377,7 +1377,7 @@ IndexStmt IndexStmt::divide(IndexVar i, IndexVar i1, IndexVar i2, size_t splitFa
   return *this;
 }
 
-IndexStmt IndexStmt::precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace, GPUWorkspace gpuworkspace) const {
+IndexStmt IndexStmt::precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace) const {
   IndexStmt transformed = *this;
   string reason;
   if (i != iw) {
@@ -1388,7 +1388,7 @@ IndexStmt IndexStmt::precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorV
     }
   }
 
-  transformed = Transformation(Precompute(expr, i, iw, workspace, gpuworkspace)).apply(transformed, &reason);
+  transformed = Transformation(Precompute(expr, i, iw, workspace)).apply(transformed, &reason);
   if (!transformed.defined()) {
     taco_uerror << reason;
   }
diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp
index de7e648f7..3b5b94c71 100644
--- a/test/tests-scheduling-eval.cpp
+++ b/test/tests-scheduling-eval.cpp
@@ -140,7 +140,7 @@ IndexStmt scheduleSpMVGPU_dsm(IndexStmt stmt, Tensor<double> A, IndexExpr precom
       .split(fpos1, warp, fpos2, NNZ_PER_WARP)
       .split(fpos2, thread, thread_nz, NNZ_PER_THREAD)
       .reorder({block, warp, thread, thread_nz})
-      .precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed, GPUWorkspace::DenseSharedMemory)
+      .precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed)
       .unroll(thread_nz_pre, NNZ_PER_THREAD)
       .parallelize(block, ParallelUnit::GPUBlock, OutputRaceStrategy::IgnoreRaces)
       .parallelize(warp, ParallelUnit::GPUWarp, OutputRaceStrategy::IgnoreRaces)
@@ -275,7 +275,7 @@ IndexStmt scheduleTTVGPU_dsm(IndexStmt stmt, Tensor<double> B, IndexExpr precomp
       .split(fpos1, warp, fpos2, NNZ_PER_WARP)
       .split(fpos2, thread, thread_nz, NNZ_PER_WARP/WARP_SIZE)
       .reorder({block, warp, thread, thread_nz})
-      .precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed, GPUWorkspace::DenseSharedMemory)
+      .precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed)
       .unroll(thread_nz_pre, NNZ_PER_WARP/WARP_SIZE)
       .parallelize(block, ParallelUnit::GPUBlock, OutputRaceStrategy::IgnoreRaces)
       .parallelize(warp, ParallelUnit::GPUWarp, OutputRaceStrategy::IgnoreRaces)
From bbff9d70284450708eed43488bd3590fe24caff7 Mon Sep 17 00:00:00 2001
From: Amalee Wilson
Date: Sun, 21 Jun 2020 23:21:02 -0700
Subject: [PATCH 4/4] Clean up first pass for dense shared memory GPU
 workspaces
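
Removes the dead commented-out allocation path in the CUDA code
generator, the unused shared-memory plumbing left over from the first
pass (Allocate::is_shared_memory and the PrecomputeRelNode and
Precompute overloads that took a workspace flag), the leftover
commented-out TensorVar constructor, and a stale assertion, and fixes
stray whitespace in the lowerer.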
---
 include/taco/index_notation/index_notation.h   | 1 -
 include/taco/index_notation/provenance_graph.h | 1 -
 include/taco/index_notation/transformations.h  | 2 +-
 include/taco/ir/ir.h                           | 1 -
 src/codegen/codegen_cuda.cpp                   | 15 ---------------
 src/index_notation/transformations.cpp         | 4 +---
 src/lower/lowerer_impl.cpp                     | 7 +++----
 7 files changed, 5 insertions(+), 26 deletions(-)

diff --git a/include/taco/index_notation/index_notation.h b/include/taco/index_notation/index_notation.h
index f4a42edc3..73607729a 100644
--- a/include/taco/index_notation/index_notation.h
+++ b/include/taco/index_notation/index_notation.h
@@ -849,7 +849,6 @@ class TensorVar : public util::Comparable<TensorVar> {
   TensorVar(const Type& type);
   TensorVar(const std::string& name, const Type& type);
   TensorVar(const Type& type, const Format& format);
-  // TensorVar(const std::string& name, const Type& type, const Format& format);
   TensorVar(const std::string& name, const Type& type, const Format& format, GPUWorkspace gpuworkspace=GPUWorkspace::None);
 
   // Returns the type of GPU workspace this TensorVar is, which is None by default.
diff --git a/include/taco/index_notation/provenance_graph.h b/include/taco/index_notation/provenance_graph.h
index e5f2b195e..cc2b12d7b 100644
--- a/include/taco/index_notation/provenance_graph.h
+++ b/include/taco/index_notation/provenance_graph.h
@@ -226,7 +226,6 @@ bool operator==(const BoundRelNode&, const BoundRelNode&);
 /// This allows precomputeVar to be scheduled separately from the parentVar
 struct PrecomputeRelNode : public IndexVarRelNode {
   PrecomputeRelNode(IndexVar parentVar, IndexVar precomputeVar);
-  PrecomputeRelNode(IndexVar parentVar, IndexVar precomputeVar, bool shared_mem);
 
   const IndexVar& getParentVar() const;
   const IndexVar& getPrecomputeVar() const;
diff --git a/include/taco/index_notation/transformations.h b/include/taco/index_notation/transformations.h
index f91783e56..3dd4fe141 100644
--- a/include/taco/index_notation/transformations.h
+++ b/include/taco/index_notation/transformations.h
@@ -86,7 +86,7 @@ std::ostream &operator<<(std::ostream &, const Reorder &);
 class Precompute : public TransformationInterface {
 public:
   Precompute();
-  Precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace, GPUWorkspace gpuworkspace=GPUWorkspace::None);
+  Precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace);
 
   IndexExpr getExpr() const;
   IndexVar geti() const;
diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h
index 2befde4ed..5445f91ea 100644
--- a/include/taco/ir/ir.h
+++ b/include/taco/ir/ir.h
@@ -686,7 +686,6 @@ struct Allocate : public StmtNode<Allocate> {
   Expr num_elements;
   Expr old_elements; // used for realloc in CUDA
   bool is_realloc;
-  bool is_shared_memory;
 
   static Stmt make(Expr var, Expr num_elements, bool is_realloc=false, Expr old_elements=Expr());
diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp
index a08f89d7a..fabc0d5ca 100644
--- a/src/codegen/codegen_cuda.cpp
+++ b/src/codegen/codegen_cuda.cpp
@@ -1150,21 +1150,6 @@ void CodeGen_CUDA::visit(const Allocate* op) {
       return;
     }
-
-
-
-    // if (parentParallelUnits.count(ParallelUnit::GPUThread)) {
-    //   // double w_GPUThread[num];
-    //   // for threads allocate thread local memory
-    //   doIndent();
-    //   stream << elementType << " ";
-    //   op->var.accept(this);
-    //   stream << "[";
-    //   op->num_elements.accept(this);
-    //   stream << "];" << endl;
-    //   return;
-    // }
-
   }
   string variable_name;
   if (op->is_realloc) {
diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp
index 60f2bf8a0..03ddba8b4 100644
--- a/src/index_notation/transformations.cpp
+++ b/src/index_notation/transformations.cpp
@@ -133,19 +133,17 @@ struct Precompute::Content {
   IndexVar i;
   IndexVar iw;
   TensorVar workspace;
-  GPUWorkspace gpuworkspace = GPUWorkspace::None;
 };
 
 Precompute::Precompute() : content(nullptr) {
 }
 
 Precompute::Precompute(IndexExpr expr, IndexVar i, IndexVar iw,
-                       TensorVar workspace, GPUWorkspace gpuworkspace) : content(new Content) {
+                       TensorVar workspace) : content(new Content) {
   content->expr = expr;
   content->i = i;
   content->iw = iw;
   content->workspace = workspace;
-  content->gpuworkspace = gpuworkspace;
 }
 
 IndexExpr Precompute::getExpr() const {
diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp
index a7bb87f32..890aee524 100644
--- a/src/lower/lowerer_impl.cpp
+++ b/src/lower/lowerer_impl.cpp
@@ -129,12 +129,12 @@ LowererImpl::lower(IndexStmt stmt, string name, bool assemble, bool compute)
     if (((TensorVar)(temp)).getGPUWorkspace() != GPUWorkspace::None){
       ir::Expr irVar = ir::Var::make(temp.getName(),
                                      temp.getType().getDataType(),
                                      true, true, ((TensorVar)(temp)).getGPUWorkspace());
-        tensorVars.insert({temp, irVar});
+      tensorVars.insert({temp, irVar});
     }
     else{
       ir::Expr irVar = ir::Var::make(temp.getName(), temp.getType().getDataType(),
                                      true, true);
-        tensorVars.insert({temp, irVar});
+      tensorVars.insert({temp, irVar});
     }
 
@@ -1300,12 +1300,11 @@ Stmt LowererImpl::lowerWhere(Where where) {
                              true, false);
       }
-      taco_iassert(temporary.getType().getOrder() == 1); // TODO
       Dimension temporarySize = temporary.getType().getShape().getDimension(0);
       Expr size;
       if (temporarySize.isFixed()) {
-          size = ir::Literal::make(temporarySize.getSize());
+        size = ir::Literal::make(temporarySize.getSize());
       }
       else if (temporarySize.isIndexVarSized()) {
         IndexVar var = temporarySize.getIndexVarSize();