diff --git a/tmva/sofie/inc/TMVA/RModel.hxx b/tmva/sofie/inc/TMVA/RModel.hxx
index 2d01cb1afe6a1..99c77dd506275 100644
--- a/tmva/sofie/inc/TMVA/RModel.hxx
+++ b/tmva/sofie/inc/TMVA/RModel.hxx
@@ -31,6 +31,7 @@ private:
    std::vector<std::string> fInputTensorNames;   // input tensor names using ONNX order
 
    std::vector<std::unique_ptr<ROperator>> fOperators;
+   std::vector<std::unique_ptr<ROperator>> fConstantOperators;
    std::vector<std::shared_ptr<RModel>> fSubGraphs;   ///<! sub-graph models (transient)
@@ ... @@ public:
    void AddInputTensorInfo(std::string input_name, ETensorType type, std::vector<Dim> shape);
    void AddInputTensorInfo(std::string input_name, ETensorType type, std::vector<size_t> shape);
-   void AddOperator(std::unique_ptr<ROperator> op, int order_execution = -1);
-   void AddOperatorReference(ROperator *op, int order_execution = -1)
-   {
-      std::unique_ptr<ROperator> tmp(op);
-      AddOperator(std::move(tmp), order_execution);
-   }
+   void AddOperator(std::unique_ptr<ROperator> op, size_t order_execution = -1);
+   void AddOperatorReference(ROperator *op, size_t order_execution = -1)
+   {
+      std::unique_ptr<ROperator> tmp(op);
+      AddOperator(std::move(tmp), order_execution);
+   }
+   void AddConstantOperator(std::unique_ptr<ROperator> op);
    void AddInitializedTensor(std::string tensor_name, ETensorType type, std::vector<std::size_t> shape,
                              std::shared_ptr<void> data);
    void AddConstantTensor(std::string tensor_name, ETensorType type, std::vector<std::size_t> shape,
@@ -161,6 +163,7 @@ protected:
    void GenerateIntermediateMemoryPool();
    // Generate all session code
    void GenerateSessionCode();
+   void CheckAndFuseOperators();
 
 public:
    const std::vector<std::string> &GetInputTensorNames() const { return fInputTensorNames; }
diff --git a/tmva/sofie/inc/TMVA/ROperator.hxx b/tmva/sofie/inc/TMVA/ROperator.hxx
index 341ce48fd7073..3eb55b5d0d559 100644
--- a/tmva/sofie/inc/TMVA/ROperator.hxx
+++ b/tmva/sofie/inc/TMVA/ROperator.hxx
@@ -2,6 +2,7 @@
 #define TMVA_SOFIE_ROPERATOR
 
 #include <vector>
+#include <set>
 #include <memory>
 
 #include "TMVA/SOFIE_common.hxx"
@@ -15,6 +16,28 @@ namespace SOFIE{
 
 class RModel;
 
+enum class OperatorKind {
+   GEMM = 0,
+   LAYERNORM = 1,
+   RELU = 2,
+   CONSTANT = 3,
+   CONSTANTOFSHAPE = 4,
+   UNDEFINED = 5
+};
+
+inline const char* toString(OperatorKind kind) {
+   switch (kind) {
+      case OperatorKind::GEMM: return "GEMM";
+      case OperatorKind::LAYERNORM: return "LAYERNORM";
+      case OperatorKind::RELU: return "RELU";
+      case OperatorKind::CONSTANT: return "CONSTANT";
+      case OperatorKind::CONSTANTOFSHAPE: return "CONSTANTOFSHAPE";
+      case OperatorKind::UNDEFINED: return "UNDEFINED";
+      default: return "UNKNOWN";
+   }
+}
+inline std::set<OperatorKind> FusableKinds = { OperatorKind::RELU, OperatorKind::LAYERNORM };
+
 class ROperator{
@@ -32,13 +55,17 @@ public:
    // generate session data members specific to operator
    virtual std::string GenerateSessionMembersCode(std::string /*opName*/) { return ""; }
    virtual std::string Header() { return "";}
+   virtual std::string GetFusableOutputTensorName() { return "";}
+   virtual void UpdateFusableTensorName(std::string){ return;};
+
    //virtual void Forward_reference() = 0;
    //virtual void Forward_blas() = 0;
 
    virtual ~ROperator(){}
 
 protected:
 
-
+   OperatorKind fKind = OperatorKind::UNDEFINED;
+   size_t fOpOrder = 0;
    const std::string SP = "   ";    ///< space used to correctly indent the generated C++ code
    bool fUseSession = false;        ///< flag to identify if using the session class
    bool fIsOutputConstant = false;  ///< flag to identify if operator has a constant output (no need to generate code)
@@ -54,7 +81,17 @@ public:
    std::span<const std::string_view> GetOpOutputTensors() const { return fOutputTensorNames; }
 
-
+
+   OperatorKind GetOpKind(){
+      return fKind;
+   }
+   void RegisterOperatorOrder(const size_t ord){
+      fOpOrder = ord;
+   }
+   size_t GetOpOrder(){
+      return fOpOrder;
+   }
+
 };
diff --git a/tmva/sofie/inc/TMVA/ROperator_Constant.hxx b/tmva/sofie/inc/TMVA/ROperator_Constant.hxx
index d16befc1fc725..b5de973dc6e5d 100644
--- a/tmva/sofie/inc/TMVA/ROperator_Constant.hxx
+++ b/tmva/sofie/inc/TMVA/ROperator_Constant.hxx
@@ -33,7 +33,13 @@ public:
         fShape(shape),
         fValues(values),
         fAttrType(type)
-   {
+   {
+      fKind = OperatorKind::CONSTANT;
+      if (!fNX.empty()) {
+         // case of ConstantOfShape (since no inputs in case of Constant operator)
+         fIsConstantOfShape = true;
+         fKind = OperatorKind::CONSTANTOFSHAPE;
+      }
       fInputTensorNames = { };
       fOutputTensorNames = { };
    }
@@ -50,9 +56,9 @@ public:
    void Initialize(RModel& model) override {
       //input must be a graph input, or already initialized intermediate tensor
       size_t length = 1;
+
+      // constant of shape case
       if (!fNX.empty()) {
-         // case of ConstantOfShape (since no inputs in case of Constant operator)
-         fIsConstantOfShape = true;
          if (model.CheckIfTensorAlreadyExist(fNX) == false){
            throw std::runtime_error("TMVA SOFIE ConstantOfShape Op Input Tensor is not found in model");
         }
diff --git a/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx b/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx
index 0aaed283e5a5e..988a9171fea91 100644
--- a/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx
+++ b/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx
@@ -49,6 +49,7 @@ namespace SOFIE{
         fAttrAlpha(alpha), fAttrBeta(beta), fAttrTransA(transA), fAttrTransB(transB), fNA(UTILITY::Clean_name(nameA)),
         fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY))
      {
+         fKind = OperatorKind::GEMM;
         fActivation = activation;
         fType = "float";
         static_assert(std::is_same_v<T, float>,
@@ -61,9 +62,11 @@
         fAttrAlpha(alpha), fAttrBeta(beta), fAttrTransA(transA), fAttrTransB(transB), fNA(UTILITY::Clean_name(nameA)),
         fNB(UTILITY::Clean_name(nameB)), fNC(UTILITY::Clean_name(nameC)), fNY(UTILITY::Clean_name(nameY)), fActivation(activation)
      {
+         fKind = OperatorKind::GEMM;
         fActivation = activation;
         fType = "float";
+
         fInputTensorNames = { fNA, fNB, fNC };
         fOutputTensorNames = { fNY };
      }
@@ -383,7 +386,13 @@
      }
 
      std::vector<std::string> GetBlasRoutines() override { return { std::string("Gemm"), std::string("Gemv") }; }
-
+      std::string GetFusableOutputTensorName() override {
+         return fNY;
+      }
+
+      void UpdateFusableTensorName(std::string fusable_tensor_name){
+         fNY = UTILITY::Clean_name(fusable_tensor_name);
+      }
   };
diff --git a/tmva/sofie/inc/TMVA/ROperator_LayerNormalization.hxx b/tmva/sofie/inc/TMVA/ROperator_LayerNormalization.hxx
index 01465f1a01cf4..dddfc8079c89a 100644
--- a/tmva/sofie/inc/TMVA/ROperator_LayerNormalization.hxx
+++ b/tmva/sofie/inc/TMVA/ROperator_LayerNormalization.hxx
@@ -58,7 +58,8 @@ public:
      : fAttrAxis(axis), fAttrEpsilon(epsilon), fAttrStashType(stashType), fNX(UTILITY::Clean_name(nameX)),
        fNScale(UTILITY::Clean_name(nameScale)), fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY)),
        fNMean(UTILITY::Clean_name(nameMean)), fNInvStdDev(UTILITY::Clean_name(nameInvStdDev))
-   {
+   {
+      fKind = OperatorKind::LAYERNORM;
      fInputTensorNames = { fNX, fNScale };
      if (!fNB.empty()){
         fInputTensorNames.emplace_back(fNB);
@@ -336,6 +337,15 @@ public:
    std::vector<std::string> GetBlasRoutines() override { return { std::string("Axpy") }; }
 
    std::vector<std::string> GetStdLibs() override { return { std::string("cmath") }; }
+
+   std::string GetFusableOutputTensorName() override {
+      return fNY;
+   }
+
+   void UpdateFusableTensorName(std::string fusable_tensor_name){
+      fNX = UTILITY::Clean_name(fusable_tensor_name);
+      fNY = UTILITY::Clean_name(fusable_tensor_name);
+   }
 };
 
 } // namespace SOFIE
diff --git a/tmva/sofie/inc/TMVA/ROperator_Relu.hxx b/tmva/sofie/inc/TMVA/ROperator_Relu.hxx
index fe3e3114c46d8..829e7694b7049 100644
--- a/tmva/sofie/inc/TMVA/ROperator_Relu.hxx
+++ b/tmva/sofie/inc/TMVA/ROperator_Relu.hxx
@@ -25,6 +25,7 @@ public:
    ROperator_Relu(){}
    ROperator_Relu(std::string nameX, std::string nameY): fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)){
+      fKind = OperatorKind::RELU;
       fInputTensorNames = { fNX };
       fOutputTensorNames = { fNY };
    }
@@ -66,6 +67,16 @@ public:
       return out.str();
    }
+
+   std::string GetFusableOutputTensorName() override {
+      return fNY;
+   }
+
+   void UpdateFusableTensorName(std::string fusable_tensor_name){
+      fNX = UTILITY::Clean_name(fusable_tensor_name);
+      fNY = UTILITY::Clean_name(fusable_tensor_name);
+   }
+
 };
 
 }//SOFIE
diff --git a/tmva/sofie/src/RModel.cxx b/tmva/sofie/src/RModel.cxx
index 6cd09dda15241..353dec3cec9e2 100644
--- a/tmva/sofie/src/RModel.cxx
+++ b/tmva/sofie/src/RModel.cxx
@@ -125,14 +125,20 @@ void RModel::AddInputTensorName(std::string input_name) {
    fInputTensorNames.emplace_back(UTILITY::Clean_name(input_name));
 }
 
-void RModel::AddOperator(std::unique_ptr<ROperator> op, int order_execution) {
+void RModel::AddOperator(std::unique_ptr<ROperator> op, size_t order_execution) {
    AddBlasRoutines(op->GetBlasRoutines());
    auto libs = op->GetStdLibs();
    auto op_input_tensors = op->GetOpInputTensors();
    for (auto& stdlib : libs) {
      AddNeededStdLib(stdlib);
    }
-   if (order_execution >= 0) {
+   if (op->GetOpKind()==OperatorKind::CONSTANT){
+      AddConstantOperator(std::move(op));
+      return;
+   }
+
+   op->RegisterOperatorOrder(order_execution);
+   if (order_execution >= 0 && order_execution <= fOperators.size()) {
       fOperators.insert(fOperators.begin() + order_execution, std::move(op));
    } else {
       fOperators.push_back(std::move(op));
@@ -140,16 +146,21 @@ void RModel::AddOperator(std::unique_ptr<ROperator> op, int order_execution) {
 
    // storing the last usage of tensors which are input to
    // operators (but are not inputs to the model, i.e. they are intermediate
-   // tensors). This information is needed to keep a check on when a
-   // particular intermediate tensor can be flushed to free up memory for reuse.
-   for(size_t index = 0; index < op_input_tensors.size(); index++){
[...]
+void RModel::AddConstantOperator(std::unique_ptr<ROperator> op){
+   fConstantOperators.push_back(std::move(op));
+}
 
 void RModel::AddInitializedTensor(std::string tensor_name, ETensorType type, std::vector<std::size_t> shape,
                                   std::shared_ptr<void> data) {
@@ -208,7 +219,8 @@ void RModel::AddIntermediateTensor(std::string tensor_name, ETensorType type, st
 void RModel::AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector<size_t> shape) {
    tensor_name = UTILITY::Clean_name(tensor_name);
    if (CheckIfTensorAlreadyExist(tensor_name)) {
-      throw std::runtime_error("TMVA-SOFIE: intermediate tensor with name " + tensor_name + " already exists \n");
+      // throw std::runtime_error("TMVA-SOFIE: intermediate tensor with name " + tensor_name + " already exists \n");
+      return;
    }
    TensorInfo new_tensor {type, shape};
    fIntermediateTensorInfos[tensor_name] = new_tensor;
@@ -279,7 +291,7 @@ std::string RModel::AllocateIntermediateMemory(std::span
 {
    std::stringstream code;
-   auto declareIntermediateTensor = [this, &code](std::string const &name, int size, int location) {
+   auto declareIntermediateTensor = [this, &code](std::string const &name, size_t size, size_t location) {
      std::string typeName = ConvertTypeToString(GetTensorType(name));
      code << "\n   // Allocating memory for intermediate tensor " << name << " with size " << size << " bytes";
      code << "\n"
@@ -334,11 +346,10 @@ std::string RModel::AllocateIntermediateMemory(std::span
 void RModel::CheckAndFlushIntermediateMemory(std::span<const std::string_view> op_input_tensors, const size_t& op_idx){
    for (auto &it : op_input_tensors){
      // last occurence of the tensor is reached => flush it from memory
-      if (fIntermediateTensorFrequencyLookup[it] == op_idx) {
+      if (fIntermediateTensorFrequencyLookup[it] == fOperators[op_idx]->GetOpOrder()){
         for (auto chunk = fIntermediateMemoryInfo.total_stack.begin(); chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk ) {
            if (chunk->second.tensor_name == it) {
-               // check if nearby chunks in available memory can coalesce
               auto first_greater = fIntermediateMemoryInfo.available_stack.upper_bound(chunk->first); // smallest element greater than the flushed chunk idx
               auto last_smaller = (first_greater == fIntermediateMemoryInfo.available_stack.begin()) ? fIntermediateMemoryInfo.available_stack.end() : std::prev(first_greater); // largest element smaller than the flushed chunk idx
@@ -368,6 +379,57 @@ void RModel::CheckAndFlushIntermediateMemory(std::span<const std::string_view> o
    }
 }
 
+void RModel::CheckAndFuseOperators() {
+   size_t idx = 0;
+   std::vector<size_t> fusable_indices;
+   std::string fusable_propagate_tensor_name;
+   while (idx < fOperators.size()) {
+      if (fOperators[idx]->GetOpKind() != OperatorKind::GEMM) {
+         ++idx;
+         continue;
+      }
+
+      fusable_indices.clear();
+      fusable_propagate_tensor_name.clear();
+
+      fusable_indices.push_back(idx);
+      size_t j = idx + 1;
+      for (; j < fOperators.size()-1; ++j) {
+         auto opKind = fOperators[j]->GetOpKind();
+
+         // Only consider operators with fusable kinds
+         if (!FusableKinds.count(opKind)) {
+            break;
+         }
+         auto tensorName = fOperators[j]->GetFusableOutputTensorName();
+         auto freqIt = fIntermediateTensorFrequencyLookup.find(tensorName);
+
+         // Propagate tensor name only if it's not used multiple times
+         if (freqIt != fIntermediateTensorFrequencyLookup.end() &&
+             (freqIt->second != fOperators[j + 1]->GetOpOrder() ||
+              FusableKinds.count(fOperators[j + 1]->GetOpKind()) == 0)) {
+            fusable_propagate_tensor_name = tensorName;
+            break;
+         } else {
+            fusable_indices.push_back(j);
+         }
+      }
+
+      if (!fusable_propagate_tensor_name.empty()) {
+         for (auto fusable_idx : fusable_indices) {
+            fOperators[fusable_idx]->UpdateFusableTensorName(fusable_propagate_tensor_name);
+         }
+      }
+
+      idx = std::max(idx + 1, j);
+   }
+}
+
 void RModel::Initialize(int batchSize, bool verbose) {
@@ -383,7 +445,7 @@ void RModel::Initialize(int batchSize, bool verbose) {
 
 void RModel::Initialize(const std::map<std::string, size_t> & inputParams, bool verbose) {
    fVerbose = int(verbose);
-
+   fVerbose = 0;
    if (fIsInitialized) {
       if (verbose)
         std::cout << "Model is already initialized - skip initialization " << std::endl;
@@ -455,28 +517,38 @@ void RModel::Initialize(const std::map<std::string, size_t> & inputParams, bool
       if (!modelHasWeights)
         fUseWeightFile = false;
    }
-   // Go through model and initialize each operator
-   int i = 0;
-   std::vector<size_t> temp_available_stack; // vector stores individual chunks of available memory that maybe reused
-
-   for(size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx){
+   for (size_t op_const_idx = 0; op_const_idx < fConstantOperators.size(); ++op_const_idx) {
       if (verbose) {
-         auto& r = *fOperators[op_idx].get();
-         std::cout << "Initializing operator " << i << " " << typeid(r).name() << std::endl;
+         auto& r = *fConstantOperators[op_const_idx].get();
+         std::cout << "Initializing constant operator " << op_const_idx << " " << typeid(r).name() << std::endl;
      }
-      fOperators[op_idx]->Initialize(*this);
-      for(auto &it:fOperators[op_idx]->GetOpOutputTensors()){
-         std::string name = std::string{it};
-         if (fIntermediateTensorFrequencyLookup.find(it) == fIntermediateTensorFrequencyLookup.end() &&
-            std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), name) == fOutputTensorNames.end() &&
-            fInitializedTensors.find(name) == fInitializedTensors.end() &&
-            fDynamicTensorInfos.find(name) == fDynamicTensorInfos.end()){
-               fIntermediateTensorFrequencyLookup[it] = op_idx;
-         }
-      }
-      i++;
+
+      fConstantOperators[op_const_idx]->Initialize(*this);
+   }
+
+   // Go through model and initialize each operator
+   for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx ) {
+      if (verbose) {
+         auto& r = *fOperators[op_idx].get();
+         std::cout << "Initializing operator " << op_idx << " " << typeid(r).name() << std::endl;
+      }
+
+      fOperators[op_idx]->Initialize(*this);
+
+      for (auto &it : fOperators[op_idx]->GetOpOutputTensors()) {
+         std::string name{it};
+         if (fIntermediateTensorFrequencyLookup.find(it) == fIntermediateTensorFrequencyLookup.end() &&
+             fInputTensorInfos.find(name) == fInputTensorInfos.end() &&
+             fInitializedTensors.find(name) == fInitializedTensors.end() &&
+             fDynamicTensorInfos.find(name) == fDynamicTensorInfos.end()) {
+            fIntermediateTensorFrequencyLookup[it] = fOperators[op_idx]->GetOpOrder();
+         }
+      }
    }
+
+   CheckAndFuseOperators();
 
    fIsInitialized = true;
 }
@@ -577,7 +649,7 @@ void RModel::GenerateIntermediateMemoryPool() {
    // char memory block is allocated since char takes 1 byte, thus easier to allocate tensors
    // of other data types
    auto const &totalStack = fIntermediateMemoryInfo.total_stack;
-   const int memPoolSize = totalStack.rbegin()->first + totalStack.rbegin()->second.tensor_size;
+   const size_t memPoolSize = totalStack.rbegin()->first + totalStack.rbegin()->second.tensor_size;
    fGC += "std::vector<char> fIntermediateMemoryPool = std::vector<char>(" + std::to_string(memPoolSize) + ");\n\n";
 }
@@ -792,6 +864,7 @@ void RModel::GenerateSessionCode()
    std::string intermediate_memory_alloc_string = "";
    intermediate_memory_alloc_string += "\n// --- Positioning intermediate tensor memory --";
    for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) {
+      std::cout<<toString(fOperators[op_idx]->GetOpKind());
       intermediate_memory_alloc_string += AllocateIntermediateMemory(fOperators[op_idx]->GetOpOutputTensors());
       CheckAndFlushIntermediateMemory(fOperators[op_idx]->GetOpInputTensors(), op_idx);
    }
@@ -875,7 +948,7 @@ void RModel::GenerateSessionCode()
       fGC += "}\n\n";
    }
-
+
    fGC += doInferSignature + "{\n";
 
    fGC += "\n";
diff --git a/tmva/sofie_parsers/src/RModelParser_ONNX.cxx b/tmva/sofie_parsers/src/RModelParser_ONNX.cxx
index 7b4ade2b6bc09..ad0221d8664b5 100644
--- a/tmva/sofie_parsers/src/RModelParser_ONNX.cxx
+++ b/tmva/sofie_parsers/src/RModelParser_ONNX.cxx
@@ -676,7 +676,7 @@ void RModelParser_ONNX::ParseONNXGraph(RModel & rmodel, const onnx::GraphProto &
    } while ((int)nodesOrder.size() < graph.node_size());
 
-   // find list of children for each operator (used for fusing oiperators)
+   // find list of children for each operator (used for fusing operators)
    std::vector<std::vector<size_t>> nodesChildren(graph.node_size());
    for (int k = 0; k < graph.node_size(); k++) {
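
For readers of this patch, the renaming performed by RModel::CheckAndFuseOperators() can be illustrated with a minimal, self-contained sketch. The Kind enum, Op struct and fuse() helper below are illustrative stand-ins, not SOFIE API: starting from a GEMM, the chain of directly following fusable operators (Relu, LayerNormalization) is collected and every member of the chain, GEMM included, is renamed (the equivalent of UpdateFusableTensorName()) to the tensor that the first non-fusable consumer expects, so the whole group reads and writes a single buffer in place and no intermediate tensor between them needs to be materialised.

#include <iostream>
#include <set>
#include <string>
#include <vector>

// Illustrative stand-ins for the ROperator bookkeeping: one input and one
// output tensor name per operator is enough to show the renaming.
enum class Kind { GEMM, RELU, LAYERNORM, OTHER };

struct Op {
   Kind kind;
   std::string in, out;
   // Equivalent of UpdateFusableTensorName(): operate in place on `name`.
   void rewire(const std::string &name) { in = name; out = name; }
};

static const std::set<Kind> fusableKinds = {Kind::RELU, Kind::LAYERNORM};

// Collect the chain of fusable operators directly following each GEMM and
// propagate the chain's final output name backwards onto the GEMM and the
// chain members, so they all share one buffer.
void fuse(std::vector<Op> &ops)
{
   for (size_t i = 0; i < ops.size(); ++i) {
      if (ops[i].kind != Kind::GEMM) continue;
      size_t j = i + 1;
      while (j < ops.size() && fusableKinds.count(ops[j].kind)) ++j;
      if (j == i + 1) continue;                   // nothing fusable follows this GEMM
      const std::string target = ops[j - 1].out;  // name the first non-fusable consumer reads
      ops[i].out = target;                        // GEMM writes straight into it
      for (size_t k = i + 1; k < j; ++k) ops[k].rewire(target);
      i = j - 1;                                  // continue scanning after the fused chain
   }
}

int main()
{
   std::vector<Op> ops = {{Kind::GEMM, "X", "Y0"},
                          {Kind::RELU, "Y0", "Y1"},
                          {Kind::LAYERNORM, "Y1", "Y2"},
                          {Kind::OTHER, "Y2", "Z"}};
   fuse(ops);
   for (const auto &o : ops) std::cout << o.in << " -> " << o.out << "\n";
   return 0;
}

Built standalone (for example with g++ -std=c++17), the sketch prints X -> Y2, Y2 -> Y2, Y2 -> Y2, Y2 -> Z: the GEMM now writes directly into the tensor the downstream consumer reads, and the Relu and LayerNorm steps update it in place, mirroring the fNX/fNY renaming the pass applies to the real operators.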