From c240b87edd402465b848c20549637ff3a3d86f53 Mon Sep 17 00:00:00 2001 From: Amalee Wilson Date: Sun, 14 Jun 2020 23:26:48 -0700 Subject: [PATCH 1/4] Added shared memory workspaces for GPUs. Functional but needs cleanup. --- include/taco/codegen/module.h | 3 + include/taco/index_notation/index_notation.h | 4 + .../taco/index_notation/provenance_graph.h | 1 + include/taco/index_notation/transformations.h | 1 + include/taco/ir/ir.h | 4 +- include/taco/tensor.h | 6 + src/codegen/codegen_cuda.cpp | 180 +++++++++++++++++- src/codegen/codegen_cuda.h | 5 + src/codegen/module.cpp | 45 +++++ src/index_notation/index_notation.cpp | 34 ++++ src/index_notation/transformations.cpp | 25 +++ src/ir/ir.cpp | 19 +- src/lower/lowerer_impl.cpp | 45 ++++- src/tensor.cpp | 26 +++ test/tests-scheduling-eval.cpp | 158 +++++++++++++++ 15 files changed, 548 insertions(+), 8 deletions(-) diff --git a/include/taco/codegen/module.h b/include/taco/codegen/module.h index 788156fdb..4b6aa6f37 100644 --- a/include/taco/codegen/module.h +++ b/include/taco/codegen/module.h @@ -25,6 +25,9 @@ class Module { /// Compile the source into a library, returning its full path std::string compile(); + + // Recompile + std::string recompile(std::string file_path); /// Compile the module into a source file located at the specified location /// path and prefix. The generated source will be path/prefix.{.c|.bc, .h} diff --git a/include/taco/index_notation/index_notation.h b/include/taco/index_notation/index_notation.h index bda31635a..04ec9c602 100644 --- a/include/taco/index_notation/index_notation.h +++ b/include/taco/index_notation/index_notation.h @@ -620,6 +620,7 @@ class IndexStmt : public util::IntrusivePtr { /// allows us to leverage scratchpad memories and /// reorder computations to increase locality IndexStmt precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace) const; + IndexStmt precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace, bool shared_mem) const; /// bound specifies a compile-time constraint on an index variable's /// iteration space that allows knowledge of the @@ -844,6 +845,9 @@ class TensorVar : public util::Comparable { TensorVar(const std::string& name, const Type& type); TensorVar(const Type& type, const Format& format); TensorVar(const std::string& name, const Type& type, const Format& format); + TensorVar(const std::string& name, const Type& type, const Format& format, bool is_shared_mem); + + bool is_shared_memory(); /// Returns the name of the tensor variable. 
std::string getName() const; diff --git a/include/taco/index_notation/provenance_graph.h b/include/taco/index_notation/provenance_graph.h index cc2b12d7b..e5f2b195e 100644 --- a/include/taco/index_notation/provenance_graph.h +++ b/include/taco/index_notation/provenance_graph.h @@ -226,6 +226,7 @@ bool operator==(const BoundRelNode&, const BoundRelNode&); /// This allows precomputeVar to be scheduled separately from the parentVar struct PrecomputeRelNode : public IndexVarRelNode { PrecomputeRelNode(IndexVar parentVar, IndexVar precomputeVar); + PrecomputeRelNode(IndexVar parentVar, IndexVar precomputeVar, bool shared_mem); const IndexVar& getParentVar() const; const IndexVar& getPrecomputeVar() const; diff --git a/include/taco/index_notation/transformations.h b/include/taco/index_notation/transformations.h index 3dd4fe141..df29b7feb 100644 --- a/include/taco/index_notation/transformations.h +++ b/include/taco/index_notation/transformations.h @@ -87,6 +87,7 @@ class Precompute : public TransformationInterface { public: Precompute(); Precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace); + Precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace, bool shared_mem); IndexExpr getExpr() const; IndexVar geti() const; diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h index 15dbdc7aa..56f186d66 100644 --- a/include/taco/ir/ir.h +++ b/include/taco/ir/ir.h @@ -254,9 +254,10 @@ struct Var : public ExprNode { std::string name; bool is_ptr; bool is_tensor; + bool is_shared_memory = false; static Expr make(std::string name, Datatype type, bool is_ptr=false, - bool is_tensor=false); + bool is_tensor=false, bool is_shared_memory=false); static const IRNodeType _type_info = IRNodeType::Var; }; @@ -685,6 +686,7 @@ struct Allocate : public StmtNode { Expr num_elements; Expr old_elements; // used for realloc in CUDA bool is_realloc; + bool is_shared_memory; static Stmt make(Expr var, Expr num_elements, bool is_realloc=false, Expr old_elements=Expr()); diff --git a/include/taco/tensor.h b/include/taco/tensor.h index 05bc1773b..cab402346 100644 --- a/include/taco/tensor.h +++ b/include/taco/tensor.h @@ -406,12 +406,18 @@ class TensorBase { void compile(IndexStmt stmt, bool assembleWhileCompute=false); + void recompile(std::string file_path); + /// Assemble the tensor storage, including index and value arrays. void assemble(); + void reassemble(); + /// Compute the given expression and put the values in the tensor storage. void compute(); + void recompute(); + /// Compile, assemble and compute as needed. void evaluate(); diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp index 5eb57c7ad..b7e59f691 100644 --- a/src/codegen/codegen_cuda.cpp +++ b/src/codegen/codegen_cuda.cpp @@ -295,6 +295,8 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor { vector> warpIDVars; vector numThreads; vector numWarps; + vector sizeSharedMemory; + std::string typeSharedMemory; CodeGen_CUDA *codeGen; // copy inputs and outputs into the map @@ -359,6 +361,17 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor { threadFors.push_back(op); threadIDVars.push_back(pair(scopeMap[op->var], op->var)); + // if (isa(op->var)){ + // std::cout << "~~~~~ Found a var! 
" << to(op->var)->name << std::endl; + // if (to(op->var)->is_shared_memory) + // { + // std::cout << "~~~~~ THIS VAR IS MARKED AS SHARED MEMORY" << std::endl; + + // // string elementType = printCUDAType(op->var.type(), false); + // // sizeSharedMemory.push_back(Mul::make(to()->num_elements, Literal::make(256))); + // } + // } + Expr blockSize = ir::simplify(ir::Div::make(ir::Sub::make(op->end, op->start), op->increment)); numThreads.push_back(blockSize); } @@ -378,6 +391,20 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor { } virtual void visit(const Var *op) { + + if (isa(op)){ + // std::cout << "~~~~~ Found a var! " << to(op)->name << std::endl; + if (to(op)->is_shared_memory) + { + std::cout << "~~~~~ THIS VAR IS MARKED AS SHARED MEMORY" << std::endl; + // std::cout << to(op) << std::endl; + string elementType = printCUDAType( op->type, false); + std::cout << "element type : " << elementType << std::endl; + typeSharedMemory = elementType; + // sizeSharedMemory.push_back(Mul::make(to()->num_elements, Literal::make(256))); + } + } + if (scopeMap.count(op) == 0) { string name = codeGen->genUniqueName(op->name); if (!inDeviceFunction) { @@ -395,6 +422,20 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor { } virtual void visit(const VarDecl *op) { + + // if (isa(op->var)){ + // std::cout << "@@@@ Found a var decl! " << to(op->var)->name << std::endl; + // if (to(op->var)->is_shared_memory) + // { + // std::cout << "@@@@@@@ THIS VAR IS MARKED AS SHARED MEMORY" << std::endl; + // // std::cout << to(op) << std::endl; + // // string elementType = printCUDAType( op->type, false); + // // std::cout << "element type : " << elementType << std::endl; + // // typeSharedMemory = elementType; + // // sizeSharedMemory.push_back(Mul::make(to()->num_elements, Literal::make(256))); + // } + // } + if (inDeviceFunction) { variablesDeclaredInKernel.insert(op->var); } @@ -580,7 +621,23 @@ void CodeGen_CUDA::printDeviceFuncCall(const vector> currentP gridSize.accept(this); stream << ", "; blockSize.accept(this); - stream << ">>>"; + + if (usesshared == false) + { + stream << ">>>"; + } + else + { + // BIG TODO: no hard code 2048 + // Should be num_threads * num_prec_elems * sizeof(prec_type) + stream << ", " ; + sizeofshared.accept(this); + stream << " * sizeof(" << typeofshared << ")>>>"; + // 2048*sizeof(double)>>>"; + } + + + stream << "("; string delimiter = ""; @@ -627,6 +684,9 @@ void CodeGen_CUDA::compile(Stmt stmt, bool isFirst) { parentParallelUnits = {}; parallelUnitSizes = {}; parallelUnitIDVars = {}; + sizeofshared = Expr(); + typeofshared = ""; + usesshared = false; emittedTimerStartCode = false; isHostFunction = true; if (isFirst) { @@ -1021,8 +1081,107 @@ void CodeGen_CUDA::visit(const Max* op) { void CodeGen_CUDA::visit(const Allocate* op) { string elementType = printCUDAType(op->var.type(), false); + + // std::cout << op->var << std::endl; + if (isa(op->var)){ + if (to(op->var)->name == "precomputed") + { + std::cout << " found precomputed!! 
" << std::endl; + // std::cout << "var : " << to(op->var) << std::endl; + // std::cout << "*var : " << *(to(op->var)) << std::endl; + // stream << "__shared__" << " "; + } + if (to(op->var)->is_shared_memory) + { + std::cout << "HEY IT IS MARKED AS SHARED MEMORY" << std::endl; + // doIndent(); + // stream << "__shared__ "; + } + else + { + std::cout << "NOT marked as shared memory" << std::endl; + } + + // __syncthreads() ??; + } + if (!isHostFunction) { - if (parentParallelUnits.count(ParallelUnit::GPUThread)) { + + if (to(op->var)->is_shared_memory) + { + std::cout << "AT visit allocate, " << to(op->var)->name << " is shared mem" << std::endl; +// __shared__ double w_GPUThread[32]; if no warps + // __shared__ double w_GPUThread_ALL[32 * # num warps]; if warps + // double * w_GPUThread = w_GPUThread_ALL + warp_id * 32; + + // Example.. todo: figure this out and put it back + taco_iassert(!op->is_realloc); + doIndent(); + stream << "__shared__ " << elementType << " "; + op->var.accept(this); + if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { + stream << "_ALL"; + } + stream << "["; + if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { + // Expr numElements = Mul::make(op->num_elements, + // Div::make(parallelUnitSizes[ParallelUnit::GPUBlock], + // parallelUnitSizes[ParallelUnit::GPUWarp])); + // Expr numElements = Mul::make(op->num_elements, + // parallelUnitSizes[ParallelUnit::GPUWarp]); + Expr numElements = Mul::make(op->num_elements, + parallelUnitSizes[ParallelUnit::GPUBlock]); + // BIG TODO: remove hard coded 8 + // std::cout << "######warp " << parallelUnitSizes[ParallelUnit::GPUWarp] << std::endl; + // std::cout << "######block " << parallelUnitSizes[ParallelUnit::GPUBlock] << std::endl; + // std::cout << "######threads " << parallelUnitSizes[ParallelUnit::GPUThread] << std::endl; + // std::cout << "######warpred " << parallelUnitSizes[ParallelUnit::GPUWarpReduction] << std::endl; + // std::cout << "######blockred " << parallelUnitSizes[ParallelUnit::GPUBlockReduction] << std::endl; + + // std::pair pr = ; + sizeofshared = numElements; + typeofshared = elementType; + usesshared = true; + // sharedMemTypes.push_back(std::make_pair(op->var, elementType)); + // std::cout << "var type from map: " << sharedMemTypes[0].second << std::endl; + // sharedMemSizes.push_back(std::make_pair(op->var, numElements)); + // std::cout << "var numElem from map: " << sharedMemSizes[0].second << std::endl; + + + + + ir::simplify(numElements).accept(this); + } + else { + std::cout << "AT visit allocate, " << to(op->var)->name << " is *** NOT*** shared mem" << std::endl; + doIndent(); + stream << elementType << " "; + op->var.accept(this); + stream << "["; + op->num_elements.accept(this); + stream << "];" << endl; + return; + } + stream << "];" << endl; + if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { + doIndent(); + stream << elementType << " * "; + op->var.accept(this); + + stream << " = "; + op->var.accept(this); + stream << "_ALL + threadIdx.x"; + // parallelUnitIDVars[ParallelUnit::GPUWarp].accept(this); + stream << " * "; + op->num_elements.accept(this); + // parallelUnitSizes[ParallelUnit::GPUWarp].accept(this); + stream << ";" << endl; + } + return; + } + else + { + if (parentParallelUnits.count(ParallelUnit::GPUThread)) { // double w_GPUThread[num]; // for threads allocate thread local memory doIndent(); @@ -1066,6 +1225,23 @@ void CodeGen_CUDA::visit(const Allocate* op) { stream << ";" << endl; } return; + } + + + + + // if (parentParallelUnits.count(ParallelUnit::GPUThread)) 
{ + // // double w_GPUThread[num]; + // // for threads allocate thread local memory + // doIndent(); + // stream << elementType << " "; + // op->var.accept(this); + // stream << "["; + // op->num_elements.accept(this); + // stream << "];" << endl; + // return; + // } + } string variable_name; if (op->is_realloc) { diff --git a/src/codegen/codegen_cuda.h b/src/codegen/codegen_cuda.h index 2bc8e000d..654344b39 100644 --- a/src/codegen/codegen_cuda.h +++ b/src/codegen/codegen_cuda.h @@ -73,6 +73,11 @@ class CodeGen_CUDA : public CodeGen { std::map parallelUnitSizes; std::map parallelUnitIDVars; + + Expr sizeofshared; + std::string typeofshared; + bool usesshared; + bool emittedTimerStartCode = false; std::ostream &out; diff --git a/src/codegen/module.cpp b/src/codegen/module.cpp index fc52c409d..6c84bc56d 100644 --- a/src/codegen/module.cpp +++ b/src/codegen/module.cpp @@ -164,6 +164,51 @@ string Module::compile() { return fullpath; } +string Module::recompile(string file_path) { + string prefix = file_path; + string fullpath = prefix + ".so"; + + string cc; + string cflags; + string file_ending; + string shims_file; + if (should_use_CUDA_codegen()) { + cc = "nvcc"; + cflags = util::getFromEnv("TACO_NVCCFLAGS", + get_default_CUDA_compiler_flags()); + file_ending = ".cu"; + shims_file = prefix + "_shims.cpp"; + } + else { + cc = util::getFromEnv(target.compiler_env, target.compiler); + cflags = util::getFromEnv("TACO_CFLAGS", + "-O3 -ffast-math -std=c99") + " -shared -fPIC"; + file_ending = ".c"; + shims_file = ""; + } +#if USE_OPENMP + cflags += " -fopenmp"; +#endif + + string cmd = cc + " " + cflags + " " + + prefix + file_ending + " " + shims_file + " " + + "-o " + fullpath + " -lm"; + + // now compile it + int err = system(cmd.data()); + taco_uassert(err == 0) << "Compilation command failed:\n" << cmd + << "\nreturned " << err; + + // use dlsym() to open the compiled library + if (lib_handle) { + dlclose(lib_handle); + } + lib_handle = dlopen(fullpath.data(), RTLD_NOW | RTLD_LOCAL); + taco_uassert(lib_handle) << "Failed to load generated code"; + + return fullpath; +} + void Module::setSource(string source) { this->source << source; moduleFromUserSource = true; diff --git a/src/index_notation/index_notation.cpp b/src/index_notation/index_notation.cpp index 7035fc034..c199e2389 100644 --- a/src/index_notation/index_notation.cpp +++ b/src/index_notation/index_notation.cpp @@ -1035,6 +1035,27 @@ IndexStmt IndexStmt::divide(IndexVar i, IndexVar i1, IndexVar i2, size_t splitFa return *this; } +IndexStmt IndexStmt::precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace, bool shared_mem) const { + if (shared_mem){ + std::cout << "is shared_mem " << __FILE__ << " " << __LINE__ << std::endl; + } + IndexStmt transformed = *this; + string reason; + if (i != iw) { + IndexVarRel rel = IndexVarRel(new PrecomputeRelNode(i, iw)); + transformed = Transformation(AddSuchThatPredicates({rel})).apply(transformed, &reason); + if (!transformed.defined()) { + taco_uerror << reason; + } + } + + transformed = Transformation(Precompute(expr, i, iw, workspace, shared_mem)).apply(transformed, &reason); + if (!transformed.defined()) { + taco_uerror << reason; + } + return transformed; +} + IndexStmt IndexStmt::precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace) const { IndexStmt transformed = *this; string reason; @@ -1486,6 +1507,7 @@ struct TensorVar::Content { Type type; Format format; Schedule schedule; + bool shared_memory = false; }; TensorVar::TensorVar() : content(nullptr) 
{ @@ -1514,6 +1536,18 @@ TensorVar::TensorVar(const string& name, const Type& type, const Format& format) content->format = format; } +TensorVar::TensorVar(const string& name, const Type& type, const Format& format, bool is_shared_mem) + : content(new Content) { + content->name = name; + content->type = type; + content->format = format; + content->shared_memory = is_shared_mem; +} + +bool TensorVar::is_shared_memory() { + return content->shared_memory; +} + std::string TensorVar::getName() const { return content->name; } diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp index 752416955..f1a1eae00 100644 --- a/src/index_notation/transformations.cpp +++ b/src/index_notation/transformations.cpp @@ -133,6 +133,7 @@ struct Precompute::Content { IndexVar i; IndexVar iw; TensorVar workspace; + bool is_shared_mem = false; }; Precompute::Precompute() : content(nullptr) { @@ -144,6 +145,16 @@ Precompute::Precompute(IndexExpr expr, IndexVar i, IndexVar iw, content->i = i; content->iw = iw; content->workspace = workspace; + content->is_shared_mem = false; +} + +Precompute::Precompute(IndexExpr expr, IndexVar i, IndexVar iw, + TensorVar workspace, bool shared_mem) : content(new Content) { + content->expr = expr; + content->i = i; + content->iw = iw; + content->workspace = workspace; + content->is_shared_mem = shared_mem; } IndexExpr Precompute::getExpr() const { @@ -235,11 +246,25 @@ IndexStmt Precompute::apply(IndexStmt stmt, std::string* reason) const { IndexStmt s = foralli.getStmt(); TensorVar ws = precompute.getWorkspace(); IndexExpr e = precompute.getExpr(); + std::cout << __FILE__ << ": " << __LINE__ << "precompute getExpr e = " << e << std::endl; + + if (ws.is_shared_memory()){ + std::cout << __FILE__ << __LINE__ << " workspace is shared memory! 
" << std::endl; + } + IndexVar iw = precompute.getiw(); + std::cout << "precompute getiw iw = " << iw << std::endl; + std::cout << "precompute ws = " << ws << std::endl; + std::cout << "precompute s = " << s << std::endl; IndexStmt consumer = forall(i, replace(s, {{e, ws(i)}})); + std::cout << "precompute consumer = " << consumer << std::endl; + IndexStmt producer = forall(iw, ws(iw) = replace(e, {{i,iw}})); + std::cout << "precompute producer = " << producer << std::endl; + Where where(consumer, producer); + std::cout << "precompute where = " << where << std::endl; stmt = where; return; diff --git a/src/ir/ir.cpp b/src/ir/ir.cpp index dbe941fe6..7656d9b26 100644 --- a/src/ir/ir.cpp +++ b/src/ir/ir.cpp @@ -241,7 +241,7 @@ bool Literal::equalsScalar(double scalar) const { return false; } -Expr Var::make(std::string name, Datatype type, bool is_ptr, bool is_tensor) { +Expr Var::make(std::string name, Datatype type, bool is_ptr, bool is_tensor, bool is_shared_mem) { Var *var = new Var; var->type = type; var->name = name; @@ -249,6 +249,12 @@ Expr Var::make(std::string name, Datatype type, bool is_ptr, bool is_tensor) { // TODO: is_ptr and is_tensor should be part of type var->is_ptr = is_ptr; var->is_tensor = is_tensor; + var->is_shared_memory = is_shared_mem; + + if (var->is_shared_memory) + { + std::cout << var->name << " is shared memory" << __FILE__ << __LINE__ << std::endl; + } return var; } @@ -644,6 +650,10 @@ Stmt For::make(Expr var, Expr start, Expr end, Expr increment, Stmt body, LoopKind kind, ParallelUnit parallel_unit, size_t unrollFactor, int vec_width) { For *loop = new For; loop->var = var; + if (var.as() && var.as()->name == "pprecomputed"){ + std::cout << " vAr is named Pprecomputed " << std::endl; + } + loop->start = start; loop->end = end; loop->increment = increment; @@ -753,6 +763,13 @@ Stmt Allocate::make(Expr var, Expr num_elements, bool is_realloc, Expr old_eleme "Can only allocate memory for a pointer-typed Var"; taco_iassert(num_elements.type().isInt() || num_elements.type().isUInt()) << "Can only allocate an integer-valued number of elements"; + + if (var.as() && var.as()->is_shared_memory){ + std::cout << " vAr is Marked as shaRed memory, nice! 
" << std::endl; + } + if (var.as() && var.as()->name == "precomputed"){ + std::cout << " vAr is named precomputed " << std::endl; + } Allocate* alloc = new Allocate; alloc->var = var; alloc->num_elements = num_elements; diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp index acc1b11bf..32ee42f07 100644 --- a/src/lower/lowerer_impl.cpp +++ b/src/lower/lowerer_impl.cpp @@ -107,6 +107,7 @@ static bool hasStores(Stmt stmt) { Stmt LowererImpl::lower(IndexStmt stmt, string name, bool assemble, bool compute) { + std::cout << "file, line, function " << __FILE__ << " " << __LINE__ << " lower\n"; this->assemble = assemble; this->compute = compute; definedIndexVarsOrdered = {}; @@ -126,9 +127,21 @@ LowererImpl::lower(IndexStmt stmt, string name, bool assemble, bool compute) // Create variables for temporaries // TODO Remove this for (auto& temp : temporaries) { - ir::Expr irVar = ir::Var::make(temp.getName(), temp.getType().getDataType(), + std::cout << __FILE__ << " " << __LINE__ << " temp: " << temp << std::endl; + if (((TensorVar)(temp)).is_shared_memory()){ + std::cout << __FILE__ << " " << __LINE__ << "TENSOR VAR IS SHARED MEMORY" << std::endl; + ir::Expr irVar = ir::Var::make(temp.getName(), temp.getType().getDataType(), + true, true, true); + tensorVars.insert({temp, irVar}); + } + else{ + ir::Expr irVar = ir::Var::make(temp.getName(), temp.getType().getDataType(), true, true); - tensorVars.insert({temp, irVar}); + std::cout << __FILE__ << " " << __LINE__ << "TENSOR VAR IS **NOT** SHARED MEMORY" << std::endl; + tensorVars.insert({temp, irVar}); + } + + } // Create variables for keeping track of result values array capacity @@ -1278,14 +1291,34 @@ Stmt LowererImpl::lowerWhere(Where where) { } else { if (generateComputeCode()) { - Expr values = ir::Var::make(temporary.getName(), + Expr values; + if (temporary.is_shared_memory()){ + std::cout << "~~~~~~ at lower where " << temporary.getName() << " is marked as shared. " << std::endl; + values = ir::Var::make(temporary.getName(), + temporary.getType().getDataType(), + true, false, true); + } + else + { + std::cout << "~~~~~~ at lower where " << temporary.getName() << " is **NOT** marked as shared. " << std::endl; + + values = ir::Var::make(temporary.getName(), temporary.getType().getDataType(), true, false); + } + + taco_iassert(temporary.getType().getOrder() == 1); // TODO Dimension temporarySize = temporary.getType().getShape().getDimension(0); Expr size; if (temporarySize.isFixed()) { - size = ir::Literal::make(temporarySize.getSize()); + // if (temporary.is_shared_memory()){ + size = ir::Literal::make(temporarySize.getSize()); + // } + // else{ + // size = ir::Literal::make(temporarySize.getSize()); + // } + // size = ir::Literal::make(temporarySize.getSize()); } else if (temporarySize.isIndexVarSized()) { IndexVar var = temporarySize.getIndexVarSize(); @@ -1297,6 +1330,7 @@ Stmt LowererImpl::lowerWhere(Where where) { } // no decl needed for shared memory + // ???? 
Stmt decl = Stmt(); if((isa(where.getProducer()) && inParallelLoopDepth == 0) || !should_use_CUDA_codegen()) { decl = VarDecl::make(values, ir::Literal::make(0)); @@ -1305,6 +1339,9 @@ Stmt LowererImpl::lowerWhere(Where where) { Expr p = Var::make("p" + temporary.getName(), Int()); Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType())); + + // if shared mem, do something + Stmt zeroInitLoop = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); freeTemporary = Free::make(values); diff --git a/src/tensor.cpp b/src/tensor.cpp index 5efba93cc..7787d54eb 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -408,6 +408,10 @@ void TensorBase::compile(taco::IndexStmt stmt, bool assembleWhileCompute) { content->module->compile(); } +void TensorBase::recompile(std::string file_path) { + content->module->recompile(file_path); +} + taco_tensor_t* TensorBase::getTacoTensorT() { return getStorage(); } @@ -473,6 +477,17 @@ void TensorBase::assemble() { } } +void TensorBase::reassemble() { + + auto arguments = packArguments(*this); + content->module->callFuncPacked("assemble", arguments.data()); + + if (!content->assembleWhileCompute) { + taco_tensor_t* tensorData = ((taco_tensor_t*)arguments[0]); + content->valuesSize = unpackTensorData(*tensorData, *this); + } +} + void TensorBase::compute() { taco_uassert(this->content->computeFunc.defined()) << error::compute_without_compile; @@ -486,6 +501,17 @@ void TensorBase::compute() { } } +void TensorBase::recompute() { + + auto arguments = packArguments(*this); + this->content->module->callFuncPacked("compute", arguments.data()); + + if (content->assembleWhileCompute) { + taco_tensor_t* tensorData = ((taco_tensor_t*)arguments[0]); + content->valuesSize = unpackTensorData(*tensorData, *this); + } +} + void TensorBase::evaluate() { this->compile(); if (!getAssignment().getOperator().defined()) { diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index 5be64a4ff..0741e6542 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -129,6 +129,25 @@ IndexStmt scheduleSpMVGPU(IndexStmt stmt, Tensor A, IndexExpr precompute .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics); } +IndexStmt scheduleSpMVGPU_wsp(IndexStmt stmt, Tensor A, IndexExpr precomputedExpr, int NNZ_PER_THREAD=8, int BLOCK_SIZE=256) { + int NNZ_PER_WARP = NNZ_PER_THREAD * WARP_SIZE; + int NNZ_PER_TB = NNZ_PER_THREAD * BLOCK_SIZE; + IndexVar f("f"), fpos("fpos"), fpos1("fpos1"), fpos2("fpos2"), block("block"), warp("warp"), thread("thread"), thread_nz("thread_nz"), thread_nz_pre("thread_nz_pre"); + TensorVar precomputed("precomputed", Type(Float64, {Dimension(thread_nz)}), taco::dense, true); + return stmt.fuse(i, j, f) + .pos(f, fpos, A(i, j)) + .split(fpos, block, fpos1, NNZ_PER_TB) + .split(fpos1, warp, fpos2, NNZ_PER_WARP) + .split(fpos2, thread, thread_nz, NNZ_PER_THREAD) + .reorder({block, warp, thread, thread_nz}) + .precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed, true) + // .precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed) + .unroll(thread_nz_pre, NNZ_PER_THREAD) + .parallelize(block, ParallelUnit::GPUBlock, OutputRaceStrategy::IgnoreRaces) + .parallelize(warp, ParallelUnit::GPUWarp, OutputRaceStrategy::IgnoreRaces) + .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics); +} + IndexStmt scheduleSpMVRowsGPU(IndexStmt stmt, Tensor A, IndexExpr precomputedExpr, int ROWS_PER_WARP=1, int BLOCK_SIZE=256) { int ROWS_PER_TB = 
ROWS_PER_WARP * BLOCK_SIZE; IndexVar block("block"), warp("warp"), thread("thread"), thread_nz("thread_nz"), i1("i1"), jpos("jpos"), block_row("block_row"), warp_row("warp_row"); @@ -188,6 +207,27 @@ IndexStmt scheduleSpMMGPU(IndexStmt stmt, Tensor A, IndexExpr precompute .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics); } +IndexStmt scheduleSpMMGPU_wsp(IndexStmt stmt, Tensor A, IndexExpr precomputedExpr, int NNZ_PER_WARP=8, int BLOCK_SIZE=256) { + int NNZ_PER_TB = NNZ_PER_WARP * (BLOCK_SIZE / WARP_SIZE); + IndexVar f("f"), fpos("fpos"), block("block"), fpos1("fpos1"), warp("warp"), nnz("nnz"), nnz_pre("nnz_pre"); + IndexVar dense_val_unbounded("dense_val_unbounded"), dense_val("dense_val"), thread("thread"); + IndexVar thread_nz("thread_nz"); + TensorVar precomputed("precomputed", Type(Float64, {Dimension(nnz)}), taco::dense); + return stmt.reorder({i, j, k}) + .fuse(i, j, f) + .pos(f, fpos, A(i, j)) + .split(fpos, block, fpos1, NNZ_PER_TB) + .split(fpos1, warp, nnz, NNZ_PER_WARP) + .split(k, dense_val_unbounded, thread, WARP_SIZE) + .reorder({block, warp, thread, dense_val_unbounded, nnz}) + //.precompute(precomputedExpr, nnz, nnz, precomputed) + .bound(dense_val_unbounded, dense_val, 4, BoundType::MaxExact) + //.unroll(dense_val, 4) + .parallelize(block, ParallelUnit::GPUBlock, OutputRaceStrategy::IgnoreRaces) + .parallelize(warp, ParallelUnit::GPUWarp, OutputRaceStrategy::IgnoreRaces) + .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics); +} + IndexStmt scheduleSDDMMGPU(IndexStmt stmt, Tensor B, int NNZ_PER_WARP=8*32, int BLOCK_SIZE=256, int CO_FACTOR=4) { int NNZ_PER_TB = NNZ_PER_WARP * (BLOCK_SIZE / WARP_SIZE); IndexVar f("f"), fpos("fpos"), block("block"), fpos1("fpos1"), warp("warp"), nnz("nnz"); @@ -797,6 +837,124 @@ TEST(scheduling_eval, spmvGPU) { ASSERT_TENSOR_EQ(expected, y); } +TEST(scheduling_eval, smpreGPU) { + if (!should_use_CUDA_codegen()) { + std::cout << "not using cuda" << std::endl; + return; + } + int NUM_I = 425; + int NUM_J = 425; + float SPARSITY = .19; + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor x("x", {NUM_J}, {Dense}); + Tensor y("y", {NUM_I}, {Dense}); + + srand(94353); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + A.insert({i, j}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + x.insert({j}, (double)rand_float); + } + + x.pack(); + A.pack(); + IndexExpr precomputed = A(i, j) * x(j); + y(i) = precomputed; + // std::cout << y(i) << std::endl; + + IndexStmt stmt = y.getAssignment().concretize(); + // stmt = scheduleSpMVGPU_wsp(stmt, A, precomputed); + stmt = scheduleSpMVGPU_wsp(stmt, A, precomputed); + std::cout << "stmt:" << std::endl; + std::cout << stmt << std::endl; + + //printToFile("spmv_gpu", stmt); + + // y.compile(stmt); + // Just overload th compile + + y.compile(stmt); + y.assemble(); + y.compute(); + + // y.recompile("/tmp/taco_tmp_Ryt3Xk/vdhf888td940"); + // y.reassemble(); + // y.recompute(); + + Tensor expected("expected", {NUM_I}, {Dense}); + expected(i) = A(i, j) * x(j); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, y); +} + +TEST(scheduling_eval, spmmGPU_wsp) { + + if (!should_use_CUDA_codegen()) { + std::cout << "not using cuda" << std::endl; + return; + } + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 128; + float 
SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor B("B", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor C("C", {NUM_I, NUM_K}, Format({{Dense, Dense}, {1, 0}})); + + srand(4343211); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + A.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + B.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + A.pack(); + B.pack(); + IndexExpr precomputed = A(i, j); + C(i, k) = B(j, k) * precomputed; + + IndexStmt stmt = C.getAssignment().concretize(); + stmt = scheduleSpMMGPU_wsp(stmt, A, precomputed); + + //printToFile("spmm_gpu", stmt); + + C.compile(stmt); + C.assemble(); + C.compute(); + + // C.recompile("/tmp/taco_tmp_HT60Vh/vdhf888td940"); + // C.reassemble(); + // C.recompute(); + + Tensor expected("expected", {NUM_I, NUM_K}, Format({{Dense, Dense}, {1, 0}})); + expected(i, k) = A(i, j) * B(j, k); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, C); +} + + TEST(scheduling_eval, spmmGPU) { if (!should_use_CUDA_codegen()) { return; From 83169e12f3dfefd82ba921381af78401d126ebfa Mon Sep 17 00:00:00 2001 From: Amalee Wilson Date: Sun, 21 Jun 2020 22:46:26 -0700 Subject: [PATCH 2/4] First pass to add support for dense shared memory workspaces for GPUs --- include/taco/codegen/module.h | 5 +- include/taco/index_notation/index_notation.h | 10 +- include/taco/index_notation/transformations.h | 3 +- include/taco/ir/ir.h | 4 +- include/taco/ir_tags.h | 6 + src/codegen/codegen_cuda.cpp | 233 ++++++------------ src/index_notation/index_notation.cpp | 42 +--- src/index_notation/transformations.cpp | 28 +-- src/ir/ir.cpp | 20 +- src/ir_tags.cpp | 1 + src/lower/lowerer_impl.cpp | 26 +- test/tests-scheduling-eval.cpp | 172 ++++++------- 12 files changed, 193 insertions(+), 357 deletions(-) diff --git a/include/taco/codegen/module.h b/include/taco/codegen/module.h index 4b6aa6f37..3c26cc9f1 100644 --- a/include/taco/codegen/module.h +++ b/include/taco/codegen/module.h @@ -26,7 +26,10 @@ class Module { /// Compile the source into a library, returning its full path std::string compile(); - // Recompile + /// Recompile. This is a debugging tool that, given the path to the temporary + /// file generated by taco, will compile the file. This function is useful + /// for development and facilitates experimentation with generated code by + /// allowing developers to modify the generated code and compile it again. 
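+  /// Example (the path is illustrative; taco writes its sources to a temporary
+  /// directory): module->recompile("/tmp/taco_tmp_XXXXXX/prefix") compiles
+  /// prefix.cu (or prefix.c) and reloads the rebuilt prefix.so.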
   std::string recompile(std::string file_path);
 
   /// Compile the module into a source file located at the specified location
diff --git a/include/taco/index_notation/index_notation.h b/include/taco/index_notation/index_notation.h
index 153f093f2..096ac6756 100644
--- a/include/taco/index_notation/index_notation.h
+++ b/include/taco/index_notation/index_notation.h
@@ -622,8 +622,7 @@ class IndexStmt : public util::IntrusivePtr<const IndexStmtNode> {
   /// The precompute transformation is described in kjolstad2019
   /// allows us to leverage scratchpad memories and
   /// reorder computations to increase locality
-  IndexStmt precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace) const;
-  IndexStmt precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace, bool shared_mem) const;
+  IndexStmt precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace, GPUWorkspace gpuworkspace=GPUWorkspace::None) const;
 
   /// bound specifies a compile-time constraint on an index variable's
   /// iteration space that allows knowledge of the
@@ -850,10 +849,11 @@ class TensorVar : public util::Comparable<TensorVar> {
   TensorVar(const Type& type);
   TensorVar(const std::string& name, const Type& type);
   TensorVar(const Type& type, const Format& format);
-  TensorVar(const std::string& name, const Type& type, const Format& format);
-  TensorVar(const std::string& name, const Type& type, const Format& format, bool is_shared_mem);
+  // TensorVar(const std::string& name, const Type& type, const Format& format);
+  TensorVar(const std::string& name, const Type& type, const Format& format, GPUWorkspace gpuworkspace=GPUWorkspace::None);
 
-  bool is_shared_memory();
+  /// Returns the kind of GPU workspace this TensorVar uses (None by default).
+  GPUWorkspace getGPUWorkspace();
 
   /// Returns the name of the tensor variable.
   std::string getName() const;
diff --git a/include/taco/index_notation/transformations.h b/include/taco/index_notation/transformations.h
index df29b7feb..f91783e56 100644
--- a/include/taco/index_notation/transformations.h
+++ b/include/taco/index_notation/transformations.h
@@ -86,8 +86,7 @@ std::ostream &operator<<(std::ostream &, const Reorder &);
 class Precompute : public TransformationInterface {
 public:
   Precompute();
-  Precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace);
-  Precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace, bool shared_mem);
+  Precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace, GPUWorkspace gpuworkspace=GPUWorkspace::None);
 
   IndexExpr getExpr() const;
   IndexVar geti() const;
diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h
index 56f186d66..2befde4ed 100644
--- a/include/taco/ir/ir.h
+++ b/include/taco/ir/ir.h
@@ -254,10 +254,10 @@ struct Var : public ExprNode<Var> {
   std::string name;
   bool is_ptr;
   bool is_tensor;
-  bool is_shared_memory = false;
+  GPUWorkspace gpuworkspace;
 
   static Expr make(std::string name, Datatype type, bool is_ptr=false,
-                   bool is_tensor=false, bool is_shared_memory=false);
+                   bool is_tensor=false, GPUWorkspace gpuworkspace=GPUWorkspace::None);
 
   static const IRNodeType _type_info = IRNodeType::Var;
 };
diff --git a/include/taco/ir_tags.h b/include/taco/ir_tags.h
index 2b1a4a4f1..39ed25eb4 100644
--- a/include/taco/ir_tags.h
+++ b/include/taco/ir_tags.h
@@ -29,4 +29,10 @@ enum class BoundType {
 extern const char *BoundType_NAMES[];
 }
 
+/* TODO: Not sure if this is the right place for these.
*/ +enum class GPUWorkspace { + None, DenseSharedMemory +}; +extern const char *GPUWorkspace_NAMES[]; + #endif //TACO_IR_TAGS_H diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp index b7e59f691..a08f89d7a 100644 --- a/src/codegen/codegen_cuda.cpp +++ b/src/codegen/codegen_cuda.cpp @@ -361,16 +361,6 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor { threadFors.push_back(op); threadIDVars.push_back(pair(scopeMap[op->var], op->var)); - // if (isa(op->var)){ - // std::cout << "~~~~~ Found a var! " << to(op->var)->name << std::endl; - // if (to(op->var)->is_shared_memory) - // { - // std::cout << "~~~~~ THIS VAR IS MARKED AS SHARED MEMORY" << std::endl; - - // // string elementType = printCUDAType(op->var.type(), false); - // // sizeSharedMemory.push_back(Mul::make(to()->num_elements, Literal::make(256))); - // } - // } Expr blockSize = ir::simplify(ir::Div::make(ir::Sub::make(op->end, op->start), op->increment)); numThreads.push_back(blockSize); @@ -393,13 +383,9 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor { virtual void visit(const Var *op) { if (isa(op)){ - // std::cout << "~~~~~ Found a var! " << to(op)->name << std::endl; - if (to(op)->is_shared_memory) + if (to(op)->gpuworkspace == GPUWorkspace::DenseSharedMemory) { - std::cout << "~~~~~ THIS VAR IS MARKED AS SHARED MEMORY" << std::endl; - // std::cout << to(op) << std::endl; string elementType = printCUDAType( op->type, false); - std::cout << "element type : " << elementType << std::endl; typeSharedMemory = elementType; // sizeSharedMemory.push_back(Mul::make(to()->num_elements, Literal::make(256))); } @@ -422,20 +408,6 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor { } virtual void visit(const VarDecl *op) { - - // if (isa(op->var)){ - // std::cout << "@@@@ Found a var decl! " << to(op->var)->name << std::endl; - // if (to(op->var)->is_shared_memory) - // { - // std::cout << "@@@@@@@ THIS VAR IS MARKED AS SHARED MEMORY" << std::endl; - // // std::cout << to(op) << std::endl; - // // string elementType = printCUDAType( op->type, false); - // // std::cout << "element type : " << elementType << std::endl; - // // typeSharedMemory = elementType; - // // sizeSharedMemory.push_back(Mul::make(to()->num_elements, Literal::make(256))); - // } - // } - if (inDeviceFunction) { variablesDeclaredInKernel.insert(op->var); } @@ -1082,150 +1054,101 @@ void CodeGen_CUDA::visit(const Max* op) { void CodeGen_CUDA::visit(const Allocate* op) { string elementType = printCUDAType(op->var.type(), false); - // std::cout << op->var << std::endl; - if (isa(op->var)){ - if (to(op->var)->name == "precomputed") - { - std::cout << " found precomputed!! 
" << std::endl; - // std::cout << "var : " << to(op->var) << std::endl; - // std::cout << "*var : " << *(to(op->var)) << std::endl; - // stream << "__shared__" << " "; - } - if (to(op->var)->is_shared_memory) - { - std::cout << "HEY IT IS MARKED AS SHARED MEMORY" << std::endl; - // doIndent(); - // stream << "__shared__ "; - } - else - { - std::cout << "NOT marked as shared memory" << std::endl; - } - - // __syncthreads() ??; - } - if (!isHostFunction) { - if (to(op->var)->is_shared_memory) + if (to(op->var)->gpuworkspace == GPUWorkspace::DenseSharedMemory) { - std::cout << "AT visit allocate, " << to(op->var)->name << " is shared mem" << std::endl; -// __shared__ double w_GPUThread[32]; if no warps - // __shared__ double w_GPUThread_ALL[32 * # num warps]; if warps - // double * w_GPUThread = w_GPUThread_ALL + warp_id * 32; - - // Example.. todo: figure this out and put it back - taco_iassert(!op->is_realloc); - doIndent(); - stream << "__shared__ " << elementType << " "; - op->var.accept(this); - if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { - stream << "_ALL"; - } - stream << "["; - if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { - // Expr numElements = Mul::make(op->num_elements, - // Div::make(parallelUnitSizes[ParallelUnit::GPUBlock], - // parallelUnitSizes[ParallelUnit::GPUWarp])); - // Expr numElements = Mul::make(op->num_elements, - // parallelUnitSizes[ParallelUnit::GPUWarp]); - Expr numElements = Mul::make(op->num_elements, - parallelUnitSizes[ParallelUnit::GPUBlock]); - // BIG TODO: remove hard coded 8 - // std::cout << "######warp " << parallelUnitSizes[ParallelUnit::GPUWarp] << std::endl; - // std::cout << "######block " << parallelUnitSizes[ParallelUnit::GPUBlock] << std::endl; - // std::cout << "######threads " << parallelUnitSizes[ParallelUnit::GPUThread] << std::endl; - // std::cout << "######warpred " << parallelUnitSizes[ParallelUnit::GPUWarpReduction] << std::endl; - // std::cout << "######blockred " << parallelUnitSizes[ParallelUnit::GPUBlockReduction] << std::endl; - - // std::pair pr = ; - sizeofshared = numElements; - typeofshared = elementType; - usesshared = true; - // sharedMemTypes.push_back(std::make_pair(op->var, elementType)); - // std::cout << "var type from map: " << sharedMemTypes[0].second << std::endl; - // sharedMemSizes.push_back(std::make_pair(op->var, numElements)); - // std::cout << "var numElem from map: " << sharedMemSizes[0].second << std::endl; - - - - - ir::simplify(numElements).accept(this); - } - else { - std::cout << "AT visit allocate, " << to(op->var)->name << " is *** NOT*** shared mem" << std::endl; + taco_iassert(!op->is_realloc); doIndent(); - stream << elementType << " "; + stream << "__shared__ " << elementType << " "; op->var.accept(this); + if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { + stream << "_ALL"; + } stream << "["; - op->num_elements.accept(this); + if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { + Expr numElements = Mul::make(op->num_elements, + parallelUnitSizes[ParallelUnit::GPUBlock]); + + sizeofshared = numElements; + typeofshared = elementType; + usesshared = true; + + ir::simplify(numElements).accept(this); + } + else { + doIndent(); + stream << elementType << " "; + op->var.accept(this); + stream << "["; + op->num_elements.accept(this); + stream << "];" << endl; + return; + } stream << "];" << endl; + if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { + doIndent(); + stream << elementType << " * "; + op->var.accept(this); + + stream << " = "; + op->var.accept(this); + stream 
<< "_ALL + threadIdx.x"; + // parallelUnitIDVars[ParallelUnit::GPUWarp].accept(this); + stream << " * "; + op->num_elements.accept(this); + // parallelUnitSizes[ParallelUnit::GPUWarp].accept(this); + stream << ";" << endl; + } return; } - stream << "];" << endl; - if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { - doIndent(); - stream << elementType << " * "; - op->var.accept(this); - - stream << " = "; - op->var.accept(this); - stream << "_ALL + threadIdx.x"; - // parallelUnitIDVars[ParallelUnit::GPUWarp].accept(this); - stream << " * "; - op->num_elements.accept(this); - // parallelUnitSizes[ParallelUnit::GPUWarp].accept(this); - stream << ";" << endl; - } - return; - } else { if (parentParallelUnits.count(ParallelUnit::GPUThread)) { - // double w_GPUThread[num]; - // for threads allocate thread local memory + // double w_GPUThread[num]; + // for threads allocate thread local memory + doIndent(); + stream << elementType << " "; + op->var.accept(this); + stream << "["; + op->num_elements.accept(this); + stream << "];" << endl; + return; + } + // __shared__ double w_GPUThread[32]; if no warps + // __shared__ double w_GPUThread_ALL[32 * # num warps]; if warps + // double * w_GPUThread = w_GPUThread_ALL + warp_id * 32; + taco_iassert(!op->is_realloc); doIndent(); - stream << elementType << " "; + stream << "__shared__ " << elementType << " "; op->var.accept(this); + if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { + stream << "_ALL"; + } stream << "["; - op->num_elements.accept(this); + if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { + Expr numElements = Mul::make(op->num_elements, Div::make(parallelUnitSizes[ParallelUnit::GPUBlock], parallelUnitSizes[ParallelUnit::GPUWarp])); + ir::simplify(numElements).accept(this); + } + else { + op->num_elements.accept(this); + } stream << "];" << endl; + if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { + doIndent(); + stream << elementType << " * "; + op->var.accept(this); + + stream << " = "; + op->var.accept(this); + stream << "_ALL + "; + parallelUnitIDVars[ParallelUnit::GPUWarp].accept(this); + stream << " * "; + parallelUnitSizes[ParallelUnit::GPUWarp].accept(this); + stream << ";" << endl; + } return; } - // __shared__ double w_GPUThread[32]; if no warps - // __shared__ double w_GPUThread_ALL[32 * # num warps]; if warps - // double * w_GPUThread = w_GPUThread_ALL + warp_id * 32; - taco_iassert(!op->is_realloc); - doIndent(); - stream << "__shared__ " << elementType << " "; - op->var.accept(this); - if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { - stream << "_ALL"; - } - stream << "["; - if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { - Expr numElements = Mul::make(op->num_elements, Div::make(parallelUnitSizes[ParallelUnit::GPUBlock], parallelUnitSizes[ParallelUnit::GPUWarp])); - ir::simplify(numElements).accept(this); - } - else { - op->num_elements.accept(this); - } - stream << "];" << endl; - if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { - doIndent(); - stream << elementType << " * "; - op->var.accept(this); - - stream << " = "; - op->var.accept(this); - stream << "_ALL + "; - parallelUnitIDVars[ParallelUnit::GPUWarp].accept(this); - stream << " * "; - parallelUnitSizes[ParallelUnit::GPUWarp].accept(this); - stream << ";" << endl; - } - return; - } diff --git a/src/index_notation/index_notation.cpp b/src/index_notation/index_notation.cpp index 48cd8b557..9a5baf677 100644 --- a/src/index_notation/index_notation.cpp +++ b/src/index_notation/index_notation.cpp @@ -1377,10 +1377,7 @@ IndexStmt 
IndexStmt::divide(IndexVar i, IndexVar i1, IndexVar i2, size_t splitFa return *this; } -IndexStmt IndexStmt::precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace, bool shared_mem) const { - if (shared_mem){ - std::cout << "is shared_mem " << __FILE__ << " " << __LINE__ << std::endl; - } +IndexStmt IndexStmt::precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace, GPUWorkspace gpuworkspace) const { IndexStmt transformed = *this; string reason; if (i != iw) { @@ -1391,25 +1388,7 @@ IndexStmt IndexStmt::precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorV } } - transformed = Transformation(Precompute(expr, i, iw, workspace, shared_mem)).apply(transformed, &reason); - if (!transformed.defined()) { - taco_uerror << reason; - } - return transformed; -} - -IndexStmt IndexStmt::precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace) const { - IndexStmt transformed = *this; - string reason; - if (i != iw) { - IndexVarRel rel = IndexVarRel(new PrecomputeRelNode(i, iw)); - transformed = Transformation(AddSuchThatPredicates({rel})).apply(transformed, &reason); - if (!transformed.defined()) { - taco_uerror << reason; - } - } - - transformed = Transformation(Precompute(expr, i, iw, workspace)).apply(transformed, &reason); + transformed = Transformation(Precompute(expr, i, iw, workspace, gpuworkspace)).apply(transformed, &reason); if (!transformed.defined()) { taco_uerror << reason; } @@ -1840,7 +1819,7 @@ struct TensorVar::Content { Type type; Format format; Schedule schedule; - bool shared_memory = false; + GPUWorkspace gpuworkspace; }; TensorVar::TensorVar() : content(nullptr) { @@ -1862,23 +1841,16 @@ TensorVar::TensorVar(const Type& type, const Format& format) : TensorVar(util::uniqueName('A'), type, format) { } -TensorVar::TensorVar(const string& name, const Type& type, const Format& format) - : content(new Content) { - content->name = name; - content->type = type; - content->format = format; -} - -TensorVar::TensorVar(const string& name, const Type& type, const Format& format, bool is_shared_mem) +TensorVar::TensorVar(const string& name, const Type& type, const Format& format, GPUWorkspace gpuworkspace) : content(new Content) { content->name = name; content->type = type; content->format = format; - content->shared_memory = is_shared_mem; + content->gpuworkspace = gpuworkspace; } -bool TensorVar::is_shared_memory() { - return content->shared_memory; +GPUWorkspace TensorVar::getGPUWorkspace() { + return content->gpuworkspace; } std::string TensorVar::getName() const { diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp index f1a1eae00..60f2bf8a0 100644 --- a/src/index_notation/transformations.cpp +++ b/src/index_notation/transformations.cpp @@ -133,28 +133,19 @@ struct Precompute::Content { IndexVar i; IndexVar iw; TensorVar workspace; - bool is_shared_mem = false; + GPUWorkspace gpuworkspace = GPUWorkspace::None; }; Precompute::Precompute() : content(nullptr) { } Precompute::Precompute(IndexExpr expr, IndexVar i, IndexVar iw, - TensorVar workspace) : content(new Content) { + TensorVar workspace, GPUWorkspace gpuworkspace) : content(new Content) { content->expr = expr; content->i = i; content->iw = iw; content->workspace = workspace; - content->is_shared_mem = false; -} - -Precompute::Precompute(IndexExpr expr, IndexVar i, IndexVar iw, - TensorVar workspace, bool shared_mem) : content(new Content) { - content->expr = expr; - content->i = i; - content->iw = iw; - content->workspace = workspace; - 
content->is_shared_mem = shared_mem; + content->gpuworkspace = gpuworkspace; } IndexExpr Precompute::getExpr() const { @@ -246,25 +237,12 @@ IndexStmt Precompute::apply(IndexStmt stmt, std::string* reason) const { IndexStmt s = foralli.getStmt(); TensorVar ws = precompute.getWorkspace(); IndexExpr e = precompute.getExpr(); - std::cout << __FILE__ << ": " << __LINE__ << "precompute getExpr e = " << e << std::endl; - - if (ws.is_shared_memory()){ - std::cout << __FILE__ << __LINE__ << " workspace is shared memory! " << std::endl; - } - IndexVar iw = precompute.getiw(); - std::cout << "precompute getiw iw = " << iw << std::endl; - std::cout << "precompute ws = " << ws << std::endl; - std::cout << "precompute s = " << s << std::endl; IndexStmt consumer = forall(i, replace(s, {{e, ws(i)}})); - std::cout << "precompute consumer = " << consumer << std::endl; - IndexStmt producer = forall(iw, ws(iw) = replace(e, {{i,iw}})); - std::cout << "precompute producer = " << producer << std::endl; Where where(consumer, producer); - std::cout << "precompute where = " << where << std::endl; stmt = where; return; diff --git a/src/ir/ir.cpp b/src/ir/ir.cpp index 7656d9b26..60d7d24f6 100644 --- a/src/ir/ir.cpp +++ b/src/ir/ir.cpp @@ -241,7 +241,7 @@ bool Literal::equalsScalar(double scalar) const { return false; } -Expr Var::make(std::string name, Datatype type, bool is_ptr, bool is_tensor, bool is_shared_mem) { +Expr Var::make(std::string name, Datatype type, bool is_ptr, bool is_tensor, GPUWorkspace gpuworkspace) { Var *var = new Var; var->type = type; var->name = name; @@ -249,12 +249,7 @@ Expr Var::make(std::string name, Datatype type, bool is_ptr, bool is_tensor, boo // TODO: is_ptr and is_tensor should be part of type var->is_ptr = is_ptr; var->is_tensor = is_tensor; - var->is_shared_memory = is_shared_mem; - - if (var->is_shared_memory) - { - std::cout << var->name << " is shared memory" << __FILE__ << __LINE__ << std::endl; - } + var->gpuworkspace = gpuworkspace; return var; } @@ -650,10 +645,6 @@ Stmt For::make(Expr var, Expr start, Expr end, Expr increment, Stmt body, LoopKind kind, ParallelUnit parallel_unit, size_t unrollFactor, int vec_width) { For *loop = new For; loop->var = var; - if (var.as() && var.as()->name == "pprecomputed"){ - std::cout << " vAr is named Pprecomputed " << std::endl; - } - loop->start = start; loop->end = end; loop->increment = increment; @@ -763,13 +754,6 @@ Stmt Allocate::make(Expr var, Expr num_elements, bool is_realloc, Expr old_eleme "Can only allocate memory for a pointer-typed Var"; taco_iassert(num_elements.type().isInt() || num_elements.type().isUInt()) << "Can only allocate an integer-valued number of elements"; - - if (var.as() && var.as()->is_shared_memory){ - std::cout << " vAr is Marked as shaRed memory, nice! 
" << std::endl; - } - if (var.as() && var.as()->name == "precomputed"){ - std::cout << " vAr is named precomputed " << std::endl; - } Allocate* alloc = new Allocate; alloc->var = var; alloc->num_elements = num_elements; diff --git a/src/ir_tags.cpp b/src/ir_tags.cpp index a7155438c..122805776 100644 --- a/src/ir_tags.cpp +++ b/src/ir_tags.cpp @@ -4,4 +4,5 @@ namespace taco { const char *ParallelUnit_NAMES[] = {"NotParallel", "DefaultUnit", "GPUBlock", "GPUWarp", "GPUThread", "CPUThread", "CPUVector", "CPUThreadGroupReduction", "GPUBlockReduction", "GPUWarpReduction"}; const char *OutputRaceStrategy_NAMES[] = {"IgnoreRaces", "NoRaces", "Atomics", "Temporary", "ParallelReduction"}; const char *BoundType_NAMES[] = {"MinExact", "MinConstraint", "MaxExact", "MaxConstraint"}; +const char *GPUWorkspace_NAMES[] = {"None", "DenseSharedMemory"}; } diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp index 32ee42f07..a7bb87f32 100644 --- a/src/lower/lowerer_impl.cpp +++ b/src/lower/lowerer_impl.cpp @@ -107,7 +107,6 @@ static bool hasStores(Stmt stmt) { Stmt LowererImpl::lower(IndexStmt stmt, string name, bool assemble, bool compute) { - std::cout << "file, line, function " << __FILE__ << " " << __LINE__ << " lower\n"; this->assemble = assemble; this->compute = compute; definedIndexVarsOrdered = {}; @@ -127,17 +126,14 @@ LowererImpl::lower(IndexStmt stmt, string name, bool assemble, bool compute) // Create variables for temporaries // TODO Remove this for (auto& temp : temporaries) { - std::cout << __FILE__ << " " << __LINE__ << " temp: " << temp << std::endl; - if (((TensorVar)(temp)).is_shared_memory()){ - std::cout << __FILE__ << " " << __LINE__ << "TENSOR VAR IS SHARED MEMORY" << std::endl; + if (((TensorVar)(temp)).getGPUWorkspace() != GPUWorkspace::None){ ir::Expr irVar = ir::Var::make(temp.getName(), temp.getType().getDataType(), - true, true, true); + true, true, ((TensorVar)(temp)).getGPUWorkspace()); tensorVars.insert({temp, irVar}); } else{ ir::Expr irVar = ir::Var::make(temp.getName(), temp.getType().getDataType(), true, true); - std::cout << __FILE__ << " " << __LINE__ << "TENSOR VAR IS **NOT** SHARED MEMORY" << std::endl; tensorVars.insert({temp, irVar}); } @@ -1292,16 +1288,13 @@ Stmt LowererImpl::lowerWhere(Where where) { else { if (generateComputeCode()) { Expr values; - if (temporary.is_shared_memory()){ - std::cout << "~~~~~~ at lower where " << temporary.getName() << " is marked as shared. " << std::endl; + if (temporary.getGPUWorkspace() != GPUWorkspace::None){ values = ir::Var::make(temporary.getName(), temporary.getType().getDataType(), - true, false, true); + true, false, temporary.getGPUWorkspace()); } else { - std::cout << "~~~~~~ at lower where " << temporary.getName() << " is **NOT** marked as shared. " << std::endl; - values = ir::Var::make(temporary.getName(), temporary.getType().getDataType(), true, false); @@ -1312,13 +1305,7 @@ Stmt LowererImpl::lowerWhere(Where where) { Dimension temporarySize = temporary.getType().getShape().getDimension(0); Expr size; if (temporarySize.isFixed()) { - // if (temporary.is_shared_memory()){ size = ir::Literal::make(temporarySize.getSize()); - // } - // else{ - // size = ir::Literal::make(temporarySize.getSize()); - // } - // size = ir::Literal::make(temporarySize.getSize()); } else if (temporarySize.isIndexVarSized()) { IndexVar var = temporarySize.getIndexVarSize(); @@ -1330,7 +1317,6 @@ Stmt LowererImpl::lowerWhere(Where where) { } // no decl needed for shared memory - // ???? 
Stmt decl = Stmt(); if((isa(where.getProducer()) && inParallelLoopDepth == 0) || !should_use_CUDA_codegen()) { decl = VarDecl::make(values, ir::Literal::make(0)); @@ -1340,8 +1326,8 @@ Stmt LowererImpl::lowerWhere(Where where) { Expr p = Var::make("p" + temporary.getName(), Int()); Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType())); - // if shared mem, do something - + // TODO: Should this zero init loop even exist for precompute for dense shared + // memory workspaces on GPUs? Stmt zeroInitLoop = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); freeTemporary = Free::make(values); diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index 0741e6542..de7e648f7 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -129,19 +129,18 @@ IndexStmt scheduleSpMVGPU(IndexStmt stmt, Tensor A, IndexExpr precompute .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics); } -IndexStmt scheduleSpMVGPU_wsp(IndexStmt stmt, Tensor A, IndexExpr precomputedExpr, int NNZ_PER_THREAD=8, int BLOCK_SIZE=256) { +IndexStmt scheduleSpMVGPU_dsm(IndexStmt stmt, Tensor A, IndexExpr precomputedExpr, int NNZ_PER_THREAD=8, int BLOCK_SIZE=256) { int NNZ_PER_WARP = NNZ_PER_THREAD * WARP_SIZE; int NNZ_PER_TB = NNZ_PER_THREAD * BLOCK_SIZE; IndexVar f("f"), fpos("fpos"), fpos1("fpos1"), fpos2("fpos2"), block("block"), warp("warp"), thread("thread"), thread_nz("thread_nz"), thread_nz_pre("thread_nz_pre"); - TensorVar precomputed("precomputed", Type(Float64, {Dimension(thread_nz)}), taco::dense, true); + TensorVar precomputed("precomputed", Type(Float64, {Dimension(thread_nz)}), taco::dense, GPUWorkspace::DenseSharedMemory); return stmt.fuse(i, j, f) .pos(f, fpos, A(i, j)) .split(fpos, block, fpos1, NNZ_PER_TB) .split(fpos1, warp, fpos2, NNZ_PER_WARP) .split(fpos2, thread, thread_nz, NNZ_PER_THREAD) .reorder({block, warp, thread, thread_nz}) - .precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed, true) - // .precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed) + .precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed, GPUWorkspace::DenseSharedMemory) .unroll(thread_nz_pre, NNZ_PER_THREAD) .parallelize(block, ParallelUnit::GPUBlock, OutputRaceStrategy::IgnoreRaces) .parallelize(warp, ParallelUnit::GPUWarp, OutputRaceStrategy::IgnoreRaces) @@ -207,27 +206,6 @@ IndexStmt scheduleSpMMGPU(IndexStmt stmt, Tensor A, IndexExpr precompute .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics); } -IndexStmt scheduleSpMMGPU_wsp(IndexStmt stmt, Tensor A, IndexExpr precomputedExpr, int NNZ_PER_WARP=8, int BLOCK_SIZE=256) { - int NNZ_PER_TB = NNZ_PER_WARP * (BLOCK_SIZE / WARP_SIZE); - IndexVar f("f"), fpos("fpos"), block("block"), fpos1("fpos1"), warp("warp"), nnz("nnz"), nnz_pre("nnz_pre"); - IndexVar dense_val_unbounded("dense_val_unbounded"), dense_val("dense_val"), thread("thread"); - IndexVar thread_nz("thread_nz"); - TensorVar precomputed("precomputed", Type(Float64, {Dimension(nnz)}), taco::dense); - return stmt.reorder({i, j, k}) - .fuse(i, j, f) - .pos(f, fpos, A(i, j)) - .split(fpos, block, fpos1, NNZ_PER_TB) - .split(fpos1, warp, nnz, NNZ_PER_WARP) - .split(k, dense_val_unbounded, thread, WARP_SIZE) - .reorder({block, warp, thread, dense_val_unbounded, nnz}) - //.precompute(precomputedExpr, nnz, nnz, precomputed) - .bound(dense_val_unbounded, dense_val, 4, BoundType::MaxExact) - //.unroll(dense_val, 4) - .parallelize(block, ParallelUnit::GPUBlock, 
-          .parallelize(warp, ParallelUnit::GPUWarp, OutputRaceStrategy::IgnoreRaces)
-          .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics);
-}
-
 IndexStmt scheduleSDDMMGPU(IndexStmt stmt, Tensor<double> B, int NNZ_PER_WARP=8*32, int BLOCK_SIZE=256, int CO_FACTOR=4) {
   int NNZ_PER_TB = NNZ_PER_WARP * (BLOCK_SIZE / WARP_SIZE);
   IndexVar f("f"), fpos("fpos"), block("block"), fpos1("fpos1"), warp("warp"), nnz("nnz");
@@ -285,6 +263,25 @@ IndexStmt scheduleTTVGPU(IndexStmt stmt, Tensor<double> B, IndexExpr precomputed
       .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics);
 }
 
+IndexStmt scheduleTTVGPU_dsm(IndexStmt stmt, Tensor<double> B, IndexExpr precomputedExpr, int NNZ_PER_WARP=8*32, int BLOCK_SIZE=256) {
+  int NNZ_PER_TB = NNZ_PER_WARP * (BLOCK_SIZE / WARP_SIZE);
+  IndexVar jk("jk"), f("f"), fpos("fpos"), block("block"), fpos1("fpos1"), warp("warp"), fpos2("fpos2"), thread("thread"), thread_nz("thread_nz"), thread_nz_pre("thread_nz_pre");
+  TensorVar precomputed("precomputed", Type(Float64, {Dimension(thread_nz)}), taco::dense, GPUWorkspace::DenseSharedMemory);
+
+  return stmt.fuse(j, k, jk)
+      .fuse(i, jk, f)
+      .pos(f, fpos, B(i,j,k))
+      .split(fpos, block, fpos1, NNZ_PER_TB)
+      .split(fpos1, warp, fpos2, NNZ_PER_WARP)
+      .split(fpos2, thread, thread_nz, NNZ_PER_WARP/WARP_SIZE)
+      .reorder({block, warp, thread, thread_nz})
+      .precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed, GPUWorkspace::DenseSharedMemory)
+      .unroll(thread_nz_pre, NNZ_PER_WARP/WARP_SIZE)
+      .parallelize(block, ParallelUnit::GPUBlock, OutputRaceStrategy::IgnoreRaces)
+      .parallelize(warp, ParallelUnit::GPUWarp, OutputRaceStrategy::IgnoreRaces)
+      .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics);
+}
+
 IndexStmt scheduleMTTKRPGPU(IndexStmt stmt, Tensor<double> B, int NNZ_PER_WARP=16, int BLOCK_SIZE=256) {
   int NNZ_PER_TB = NNZ_PER_WARP * (BLOCK_SIZE / WARP_SIZE);
   IndexVar kl("kl"), f("f"), fpos("fpos"), block("block"), fpos1("fpos1"), warp("warp"), nnz("nnz"), dense_val_unbounded("dense_val_unbounded"), dense_val("dense_val"), thread("thread");
@@ -837,9 +834,8 @@ TEST(scheduling_eval, spmvGPU) {
   ASSERT_TENSOR_EQ(expected, y);
 }
 
-TEST(scheduling_eval, smpreGPU) {
+TEST(scheduling_eval, spmvGPU_dsm) {
   if (!should_use_CUDA_codegen()) {
-    std::cout << "not using cuda" << std::endl;
     return;
   }
   int NUM_I = 425;
@@ -868,24 +864,18 @@ TEST(scheduling_eval, smpreGPU) {
   A.pack();
   IndexExpr precomputed = A(i, j) * x(j);
   y(i) = precomputed;
-  // std::cout << y(i) << std::endl;
 
   IndexStmt stmt = y.getAssignment().concretize();
-  // stmt = scheduleSpMVGPU_wsp(stmt, A, precomputed);
-  stmt = scheduleSpMVGPU_wsp(stmt, A, precomputed);
-  std::cout << "stmt:" << std::endl;
-  std::cout << stmt << std::endl;
-
+  stmt = scheduleSpMVGPU_dsm(stmt, A, precomputed);
 
   //printToFile("spmv_gpu", stmt);
 
-  // y.compile(stmt);
-  // Just overload th compile
   y.compile(stmt);
   y.assemble();
   y.compute();
 
-  // y.recompile("/tmp/taco_tmp_Ryt3Xk/vdhf888td940");
+  // Example of using "recompile" to debug
+  // y.recompile("/tmp/taco_tmp_88888/xxxxxxx");
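+  // (Assumed behavior: recompile() rebuilds the kernel library from an
+  // already-generated source file at the given path; reassemble()/recompute()
+  // then rerun it. Useful for hand-editing the emitted CUDA; the path above
+  // is only a placeholder.)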
   // y.reassemble();
   // y.recompute();
 
@@ -897,64 +887,6 @@ TEST(scheduling_eval, smpreGPU) {
   ASSERT_TENSOR_EQ(expected, y);
 }
 
-TEST(scheduling_eval, spmmGPU_wsp) {
-
-  if (!should_use_CUDA_codegen()) {
-    std::cout << "not using cuda" << std::endl;
-    return;
-  }
-  int NUM_I = 1021/10;
-  int NUM_J = 1039/10;
-  int NUM_K = 128;
-  float SPARSITY = .3;
-  Tensor<double> A("A", {NUM_I, NUM_J}, CSR);
-  Tensor<double> B("B", {NUM_J, NUM_K}, {Dense, Dense});
-  Tensor<double> C("C", {NUM_I, NUM_K}, Format({{Dense, Dense}, {1, 0}}));
-
-  srand(4343211);
-  for (int i = 0; i < NUM_I; i++) {
-    for (int j = 0; j < NUM_J; j++) {
-      float rand_float = (float)rand()/(float)(RAND_MAX);
-      if (rand_float < SPARSITY) {
-        A.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY)));
-      }
-    }
-  }
-
-  for (int j = 0; j < NUM_J; j++) {
-    for (int k = 0; k < NUM_K; k++) {
-      float rand_float = (float)rand()/(float)(RAND_MAX);
-      B.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY)));
-    }
-  }
-
-  A.pack();
-  B.pack();
-  IndexExpr precomputed = A(i, j);
-  C(i, k) = B(j, k) * precomputed;
-
-  IndexStmt stmt = C.getAssignment().concretize();
-  stmt = scheduleSpMMGPU_wsp(stmt, A, precomputed);
-
-  //printToFile("spmm_gpu", stmt);
-
-  C.compile(stmt);
-  C.assemble();
-  C.compute();
-
-  // C.recompile("/tmp/taco_tmp_HT60Vh/vdhf888td940");
-  // C.reassemble();
-  // C.recompute();
-
-  Tensor<double> expected("expected", {NUM_I, NUM_K}, Format({{Dense, Dense}, {1, 0}}));
-  expected(i, k) = A(i, j) * B(j, k);
-  expected.compile();
-  expected.assemble();
-  expected.compute();
-  ASSERT_TENSOR_EQ(expected, C);
-}
-
 TEST(scheduling_eval, spmmGPU) {
   if (!should_use_CUDA_codegen()) {
     return;
@@ -1223,6 +1155,58 @@ TEST(scheduling_eval, ttvGPU) {
   ASSERT_TENSOR_EQ(expected, A);
 }
 
+TEST(scheduling_eval, ttvGPU_dsm) {
+  if (!should_use_CUDA_codegen()) {
+    return;
+  }
+  int NUM_I = 1021/10;
+  int NUM_J = 1039/10;
+  int NUM_K = 1057/10;
+  float SPARSITY = .3;
+  Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs
+  Tensor<double> B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse});
+  Tensor<double> c("c", {NUM_K}, {Dense});
+
+  srand(353252);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      for (int k = 0; k < NUM_K; k++) {
+        float rand_float = (float) rand() / (float) (RAND_MAX);
+        if (rand_float < SPARSITY) {
+          B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY)));
+        }
+      }
+    }
+  }
+
+  for (int k = 0; k < NUM_K; k++) {
+    float rand_float = (float)rand()/(float)(RAND_MAX);
+    c.insert({k}, (double) ((int) (rand_float*3)));
+  }
+
+  B.pack();
+  c.pack();
+
+  IndexExpr precomputedExpr = B(i,j,k) * c(k);
+  A(i,j) = precomputedExpr;
+
+  IndexStmt stmt = A.getAssignment().concretize();
+  stmt = scheduleTTVGPU_dsm(stmt, B, precomputedExpr);
+
+  //printToFile("ttv_gpu", stmt);
+
+  A.compile(stmt);
+  A.assemble();
+  A.compute();
+
+  Tensor<double> expected("expected", {NUM_I, NUM_J}, {Dense, Dense});
+  expected(i,j) = B(i,j,k) * c(k);
+  expected.compile();
+  expected.assemble();
+  expected.compute();
+  ASSERT_TENSOR_EQ(expected, A);
+}
+
 TEST(scheduling_eval, mttkrpGPU) {
   if (!should_use_CUDA_codegen()) {
     return;
From fc9c191c02286329bcba63e92b60b8cec8795229 Mon Sep 17 00:00:00 2001
From: Amalee Wilson
Date: Sun, 21 Jun 2020 23:07:42 -0700
Subject: [PATCH 3/4] Remove redundant GPU workspace specification
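
Before this change a schedule had to name the GPU workspace twice, once
on the TensorVar and again in the precompute() call (see the test diffs
below):

    TensorVar precomputed("precomputed", Type(Float64, {Dimension(thread_nz)}),
                          taco::dense, GPUWorkspace::DenseSharedMemory);
    stmt = stmt.precompute(precomputedExpr, thread_nz, thread_nz_pre,
                           precomputed, GPUWorkspace::DenseSharedMemory);

The TensorVar alone now carries the workspace kind, so the call shrinks to:

    stmt = stmt.precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed);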
---
 include/taco/index_notation/index_notation.h | 2 +-
 src/index_notation/index_notation.cpp        | 4 ++--
 test/tests-scheduling-eval.cpp               | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/taco/index_notation/index_notation.h b/include/taco/index_notation/index_notation.h
index 096ac6756..f4a42edc3 100644
--- a/include/taco/index_notation/index_notation.h
+++ b/include/taco/index_notation/index_notation.h
@@ -622,7 +622,7 @@ class IndexStmt : public util::IntrusivePtr<const IndexStmtNode> {
   /// The precompute transformation is described in kjolstad2019
   /// allows us to leverage scratchpad memories and
   /// reorder computations to increase locality
-  IndexStmt precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace, GPUWorkspace gpuworkspace=GPUWorkspace::None) const;
+  IndexStmt precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace) const;
 
   /// bound specifies a compile-time constraint on an index variable's
   /// iteration space that allows knowledge of the
diff --git a/src/index_notation/index_notation.cpp b/src/index_notation/index_notation.cpp
index 9a5baf677..7b9c10bf5 100644
--- a/src/index_notation/index_notation.cpp
+++ b/src/index_notation/index_notation.cpp
@@ -1377,7 +1377,7 @@ IndexStmt IndexStmt::divide(IndexVar i, IndexVar i1, IndexVar i2, size_t splitFa
   return *this;
 }
 
-IndexStmt IndexStmt::precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace, GPUWorkspace gpuworkspace) const {
+IndexStmt IndexStmt::precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace) const {
   IndexStmt transformed = *this;
   string reason;
   if (i != iw) {
@@ -1388,7 +1388,7 @@ IndexStmt IndexStmt::precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorV
     }
   }
 
-  transformed = Transformation(Precompute(expr, i, iw, workspace, gpuworkspace)).apply(transformed, &reason);
+  transformed = Transformation(Precompute(expr, i, iw, workspace)).apply(transformed, &reason);
   if (!transformed.defined()) {
     taco_uerror << reason;
   }
diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp
index de7e648f7..3b5b94c71 100644
--- a/test/tests-scheduling-eval.cpp
+++ b/test/tests-scheduling-eval.cpp
@@ -140,7 +140,7 @@ IndexStmt scheduleSpMVGPU_dsm(IndexStmt stmt, Tensor<double> A, IndexExpr precom
       .split(fpos1, warp, fpos2, NNZ_PER_WARP)
       .split(fpos2, thread, thread_nz, NNZ_PER_THREAD)
       .reorder({block, warp, thread, thread_nz})
-      .precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed, GPUWorkspace::DenseSharedMemory)
+      .precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed)
       .unroll(thread_nz_pre, NNZ_PER_THREAD)
       .parallelize(block, ParallelUnit::GPUBlock, OutputRaceStrategy::IgnoreRaces)
       .parallelize(warp, ParallelUnit::GPUWarp, OutputRaceStrategy::IgnoreRaces)
@@ -275,7 +275,7 @@ IndexStmt scheduleTTVGPU_dsm(IndexStmt stmt, Tensor<double> B, IndexExpr precomp
       .split(fpos1, warp, fpos2, NNZ_PER_WARP)
       .split(fpos2, thread, thread_nz, NNZ_PER_WARP/WARP_SIZE)
       .reorder({block, warp, thread, thread_nz})
-      .precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed, GPUWorkspace::DenseSharedMemory)
+      .precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed)
       .unroll(thread_nz_pre, NNZ_PER_WARP/WARP_SIZE)
       .parallelize(block, ParallelUnit::GPUBlock, OutputRaceStrategy::IgnoreRaces)
       .parallelize(warp, ParallelUnit::GPUWarp, OutputRaceStrategy::IgnoreRaces)
From bbff9d70284450708eed43488bd3590fe24caff7 Mon Sep 17 00:00:00 2001
From: Amalee Wilson
Date: Sun, 21 Jun 2020 23:21:02 -0700
Subject: [PATCH 4/4] Clean up first pass for dense shared memory GPU
 workspaces
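
Removes the dead commented-out allocation path in the CUDA code
generator, the unused shared-memory plumbing left over from the first
pass (Allocate::is_shared_memory and the PrecomputeRelNode and
Precompute overloads that took a workspace flag), the leftover
commented-out TensorVar constructor, and a stale assertion, and fixes
stray whitespace in the lowerer.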
---
 include/taco/index_notation/index_notation.h   | 1 -
 include/taco/index_notation/provenance_graph.h | 1 -
 include/taco/index_notation/transformations.h  | 2 +-
 include/taco/ir/ir.h                           | 1 -
 src/codegen/codegen_cuda.cpp                   | 15 ---------------
 src/index_notation/transformations.cpp         | 4 +---
 src/lower/lowerer_impl.cpp                     | 7 +++----
 7 files changed, 5 insertions(+), 26 deletions(-)

diff --git a/include/taco/index_notation/index_notation.h b/include/taco/index_notation/index_notation.h
index f4a42edc3..73607729a 100644
--- a/include/taco/index_notation/index_notation.h
+++ b/include/taco/index_notation/index_notation.h
@@ -849,7 +849,6 @@ class TensorVar : public util::Comparable<TensorVar> {
   TensorVar(const Type& type);
   TensorVar(const std::string& name, const Type& type);
   TensorVar(const Type& type, const Format& format);
-  // TensorVar(const std::string& name, const Type& type, const Format& format);
   TensorVar(const std::string& name, const Type& type, const Format& format, GPUWorkspace gpuworkspace=GPUWorkspace::None);
 
   // Returns the type of GPU workspace this TensorVar is, which is None by default.
diff --git a/include/taco/index_notation/provenance_graph.h b/include/taco/index_notation/provenance_graph.h
index e5f2b195e..cc2b12d7b 100644
--- a/include/taco/index_notation/provenance_graph.h
+++ b/include/taco/index_notation/provenance_graph.h
@@ -226,7 +226,6 @@ bool operator==(const BoundRelNode&, const BoundRelNode&);
 /// This allows precomputeVar to be scheduled separately from the parentVar
 struct PrecomputeRelNode : public IndexVarRelNode {
   PrecomputeRelNode(IndexVar parentVar, IndexVar precomputeVar);
-  PrecomputeRelNode(IndexVar parentVar, IndexVar precomputeVar, bool shared_mem);
 
   const IndexVar& getParentVar() const;
   const IndexVar& getPrecomputeVar() const;
diff --git a/include/taco/index_notation/transformations.h b/include/taco/index_notation/transformations.h
index f91783e56..3dd4fe141 100644
--- a/include/taco/index_notation/transformations.h
+++ b/include/taco/index_notation/transformations.h
@@ -86,7 +86,7 @@ std::ostream &operator<<(std::ostream &, const Reorder &);
 class Precompute : public TransformationInterface {
 public:
   Precompute();
-  Precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace, GPUWorkspace gpuworkspace=GPUWorkspace::None);
+  Precompute(IndexExpr expr, IndexVar i, IndexVar iw, TensorVar workspace);
 
   IndexExpr getExpr() const;
   IndexVar geti() const;
diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h
index 2befde4ed..5445f91ea 100644
--- a/include/taco/ir/ir.h
+++ b/include/taco/ir/ir.h
@@ -686,7 +686,6 @@ struct Allocate : public StmtNode<Allocate> {
   Expr num_elements;
   Expr old_elements; // used for realloc in CUDA
   bool is_realloc;
-  bool is_shared_memory;
 
   static Stmt make(Expr var, Expr num_elements, bool is_realloc=false, Expr old_elements=Expr());
diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp
index a08f89d7a..fabc0d5ca 100644
--- a/src/codegen/codegen_cuda.cpp
+++ b/src/codegen/codegen_cuda.cpp
@@ -1150,21 +1150,6 @@ void CodeGen_CUDA::visit(const Allocate* op) {
       return;
     }
-
-
-
-    // if (parentParallelUnits.count(ParallelUnit::GPUThread)) {
-    //   // double w_GPUThread[num];
-    //   // for threads allocate thread local memory
-    //   doIndent();
-    //   stream << elementType << " ";
-    //   op->var.accept(this);
-    //   stream << "[";
-    //   op->num_elements.accept(this);
-    //   stream << "];" << endl;
-    //   return;
-    // }
-
   }
   string variable_name;
   if (op->is_realloc) {
diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp
index 60f2bf8a0..03ddba8b4 100644
--- a/src/index_notation/transformations.cpp
+++ b/src/index_notation/transformations.cpp
@@ -133,19 +133,17 @@ struct Precompute::Content {
   IndexVar i;
   IndexVar iw;
   TensorVar workspace;
-  GPUWorkspace gpuworkspace = GPUWorkspace::None;
 };
 
 Precompute::Precompute() : content(nullptr) {
 }
 
 Precompute::Precompute(IndexExpr expr, IndexVar i, IndexVar iw,
-                       TensorVar workspace, GPUWorkspace gpuworkspace) : content(new Content) {
+                       TensorVar workspace) : content(new Content) {
   content->expr = expr;
   content->i = i;
   content->iw = iw;
   content->workspace = workspace;
-  content->gpuworkspace = gpuworkspace;
 }
 
 IndexExpr Precompute::getExpr() const {
diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp
index a7bb87f32..890aee524 100644
--- a/src/lower/lowerer_impl.cpp
+++ b/src/lower/lowerer_impl.cpp
@@ -129,12 +129,12 @@ LowererImpl::lower(IndexStmt stmt, string name, bool assemble, bool compute)
     if (((TensorVar)(temp)).getGPUWorkspace() != GPUWorkspace::None){
       ir::Expr irVar = ir::Var::make(temp.getName(),
                                      temp.getType().getDataType(),
                                      true, true, ((TensorVar)(temp)).getGPUWorkspace());
-        tensorVars.insert({temp, irVar});
+      tensorVars.insert({temp, irVar});
     }
     else{
       ir::Expr irVar = ir::Var::make(temp.getName(), temp.getType().getDataType(),
                                      true, true);
-        tensorVars.insert({temp, irVar});
+      tensorVars.insert({temp, irVar});
     }
 
@@ -1300,12 +1300,11 @@ Stmt LowererImpl::lowerWhere(Where where) {
                              true, false);
       }
-      taco_iassert(temporary.getType().getOrder() == 1); // TODO
       Dimension temporarySize = temporary.getType().getShape().getDimension(0);
       Expr size;
       if (temporarySize.isFixed()) {
-          size = ir::Literal::make(temporarySize.getSize());
+        size = ir::Literal::make(temporarySize.getSize());
       }
       else if (temporarySize.isIndexVarSized()) {
         IndexVar var = temporarySize.getIndexVarSize();