diff --git a/src/turbomind/engine/model_request.cc b/src/turbomind/engine/model_request.cc
index ba7ebe321f..805b6dac5d 100644
--- a/src/turbomind/engine/model_request.cc
+++ b/src/turbomind/engine/model_request.cc
@@ -13,12 +13,14 @@
 namespace turbomind {
 
-ModelRequest::ModelRequest(Gateway* gateway, DataType data_type, int session_len, int vocab_size, int hidden_dim):
+ModelRequest::ModelRequest(
+    Gateway* gateway, DataType data_type, int session_len, int vocab_size, int hidden_dim, int tp_size):
     gateway_{gateway},
     data_type_{data_type},
     session_len_{session_len},
     vocab_size_{vocab_size},
-    hidden_dim_{hidden_dim}
+    hidden_dim_{hidden_dim},
+    tp_size_{tp_size}
 {
 }
 
@@ -127,8 +129,14 @@ auto ModelRequest::Forward(InputParam param, std::function<void()> cb) -> Output
     r->output_ids      = outputs_->at("output_ids");
     r->sequence_length = outputs_->at("sequence_length");
 
+    r->matchers.clear();
     if (grammar_) {
-        r->matcher = std::make_shared<xgrammar::GrammarMatcher>(*grammar_);
+        for (int i = 0; i < tp_size_; ++i) {
+            r->matchers.push_back(std::make_shared<xgrammar::GrammarMatcher>(*grammar_));
+        }
+    }
+    else {
+        r->matchers.resize(tp_size_);
     }
 
     // Keep a weak reference for canceling the request
diff --git a/src/turbomind/engine/model_request.h b/src/turbomind/engine/model_request.h
index 7582163095..b05a980312 100644
--- a/src/turbomind/engine/model_request.h
+++ b/src/turbomind/engine/model_request.h
@@ -15,7 +15,7 @@ class ModelRequest {
 public:
     virtual ~ModelRequest() = default;
 
-    ModelRequest(Gateway* gateway, DataType data_type, int session_len, int vocab_size, int hidden_dim);
+    ModelRequest(Gateway* gateway, DataType data_type, int session_len, int vocab_size, int hidden_dim, int tp_size);
 
     // Cancel running request
     void Cancel();
@@ -50,6 +50,7 @@ class ModelRequest {
     const int session_len_;
     const int hidden_dim_;
     const int vocab_size_;
+    const int tp_size_;
 
     uint64_t session_id_;
 
diff --git a/src/turbomind/engine/request.h b/src/turbomind/engine/request.h
index aa50a48100..a5661bf82c 100644
--- a/src/turbomind/engine/request.h
+++ b/src/turbomind/engine/request.h
@@ -154,7 +154,7 @@ struct Request {
         kInconsistency = 9,  // Inconsistent request parameters, e.g. prefix caching is not allowed in interactive mode
     };
 
-    std::shared_ptr<xgrammar::GrammarMatcher> matcher;
+    std::vector<std::shared_ptr<xgrammar::GrammarMatcher>> matchers;  // GrammarMatchers for different threads (tp_size)
 };
 
 inline void UpdateState(Request& r, int status, int seq_len)
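[Reviewer note, not part of the patch] The change above is the heart of the fix:
a Request now owns one GrammarMatcher per tensor-parallel rank instead of a
single shared instance. The matcher is stateful (it advances every time a token
is accepted), and with TP each rank runs the sampling layers on its own thread,
so sharing one instance across ranks is a data race. A minimal sketch of the
ownership pattern, with illustrative stand-in names (StatefulMatcher plays the
role of xgrammar::GrammarMatcher):

#include <memory>
#include <vector>

struct StatefulMatcher {
    int accepted = 0;
    void Accept(int /*token*/) { ++accepted; }  // mutates internal state
};

struct RequestSketch {
    std::vector<std::shared_ptr<StatefulMatcher>> matchers;  // one per rank
};

int main()
{
    const int     tp_size = 4;
    RequestSketch r;
    for (int i = 0; i < tp_size; ++i) {
        r.matchers.push_back(std::make_shared<StatefulMatcher>());
    }
    // Every rank accepts the same sampled token, so the copies never diverge,
    // and rank i only ever touches r.matchers[i].
    const int sampled = 42;
    for (auto& m : r.matchers) {
        m->Accept(sampled);
    }
}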
diff --git a/src/turbomind/layers/BaseDynamicDecodeLayer.h b/src/turbomind/layers/BaseDynamicDecodeLayer.h
index a3e14407ff..239b8bc78d 100644
--- a/src/turbomind/layers/BaseDynamicDecodeLayer.h
+++ b/src/turbomind/layers/BaseDynamicDecodeLayer.h
@@ -31,6 +31,7 @@ class BaseDynamicDecodeLayer {
         int                   vocab_size_padded;
         cudaStream_t          stream;
         const cudaDeviceProp* device_prop;
+        int                   tp_rank;
     };
 
     virtual ~BaseDynamicDecodeLayer() = default;
@@ -42,6 +43,7 @@
         vocab_size_padded_ = param.vocab_size_padded;
         stream_            = param.stream;
         device_prop_       = param.device_prop;
+        tp_rank_           = param.tp_rank;
     };
 
     virtual void Setup(const std::vector<const Request*>& rs, const TensorMap& args) = 0;
@@ -54,6 +56,7 @@
     int                   vocab_size_padded_;
     cudaStream_t          stream_;
     const cudaDeviceProp* device_prop_;
+    int                   tp_rank_;
 };
 
 }  // namespace turbomind
diff --git a/src/turbomind/layers/DynamicDecodeLayer.cc b/src/turbomind/layers/DynamicDecodeLayer.cc
index 5a66bf1fb6..11a54cb6aa 100644
--- a/src/turbomind/layers/DynamicDecodeLayer.cc
+++ b/src/turbomind/layers/DynamicDecodeLayer.cc
@@ -31,11 +31,14 @@ DynamicDecodeLayer::DynamicDecodeLayer(DataType dtype,
                                        int                   vocab_size,
                                        int                   vocab_size_padded,
                                        cudaStream_t          stream,
-                                       const cudaDeviceProp* device_prop)
+                                       const cudaDeviceProp* device_prop,
+                                       int                   tp_rank):
+    tp_rank_{tp_rank}
 {
     TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     TM_CHECK(dtype == kFloat32);
-    BaseDynamicDecodeLayer::BaseParam param{max_batch_size, vocab_size, vocab_size_padded, stream, device_prop};
+    BaseDynamicDecodeLayer::BaseParam param{
+        max_batch_size, vocab_size, vocab_size_padded, stream, device_prop, tp_rank};
     layers_.emplace_back(new LogitsProcessorLayer<float>{param});
     layers_.emplace_back(new GuidedDecodeMaskLayer<float>{param});
     layers_.emplace_back(new SamplingLayer<float>{param});
diff --git a/src/turbomind/layers/DynamicDecodeLayer.h b/src/turbomind/layers/DynamicDecodeLayer.h
index c527ff8e0f..233f7f6f9b 100644
--- a/src/turbomind/layers/DynamicDecodeLayer.h
+++ b/src/turbomind/layers/DynamicDecodeLayer.h
@@ -33,7 +33,8 @@ class DynamicDecodeLayer {
                        int                   vocab_size,
                        int                   vocab_size_padded,
                        cudaStream_t          stream,
-                       const cudaDeviceProp* device_prop);
+                       const cudaDeviceProp* device_prop,
+                       int                   tp_rank);
 
     ~DynamicDecodeLayer();
 
@@ -42,6 +43,7 @@
     void Forward(TensorMap& args);
 
 private:
+    int tp_rank_;
     std::vector<std::unique_ptr<BaseDynamicDecodeLayer>> layers_;
 };
 
diff --git a/src/turbomind/layers/sampling_layers/GuidedDecodeMaskLayer.cc b/src/turbomind/layers/sampling_layers/GuidedDecodeMaskLayer.cc
index 2262992902..2371fe56c0 100644
--- a/src/turbomind/layers/sampling_layers/GuidedDecodeMaskLayer.cc
+++ b/src/turbomind/layers/sampling_layers/GuidedDecodeMaskLayer.cc
@@ -33,7 +33,7 @@ void GuidedDecodeMaskLayer<T>::Setup(const std::vector<const Request*>& rs, cons
     TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
     matchers_.clear();
     for (const auto& r : rs) {
-        matchers_.push_back(r->matcher);
+        matchers_.push_back(r->matchers[tp_rank_]);
     }
 }
 
diff --git a/src/turbomind/layers/sampling_layers/GuidedDecodeUpdateLayer.cc b/src/turbomind/layers/sampling_layers/GuidedDecodeUpdateLayer.cc
index 653a8874d8..25c48e3f23 100644
--- a/src/turbomind/layers/sampling_layers/GuidedDecodeUpdateLayer.cc
+++ b/src/turbomind/layers/sampling_layers/GuidedDecodeUpdateLayer.cc
@@ -15,6 +15,7 @@
  */
 
 #include "src/turbomind/layers/sampling_layers/GuidedDecodeUpdateLayer.h"
+#include "src/turbomind/core/context.h"
 
 namespace turbomind {
 
@@ -29,7 +30,7 @@ void GuidedDecodeUpdateLayer<T>::Setup(const std::vector<const Request*>& rs, co
     TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
     matchers_.clear();
     for (const auto& r : rs) {
-        matchers_.push_back(r->matcher);
+        matchers_.push_back(r->matchers[tp_rank_]);
     }
 }
 
@@ -45,6 +46,7 @@ void GuidedDecodeUpdateLayer<T>::Forward(TensorMap& args)
     FT_CHECK(bsz == matchers_.size());
 
     Copy(output_ids.slice(step * bsz, bsz), output_ids_buf);
+    core::Context::stream().Sync();
 
     for (size_t i = 0; i < bsz; ++i) {
         const auto& matcher = matchers_[i];
diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc
index 68185eac38..729f9b380a 100644
--- a/src/turbomind/models/llama/LlamaV2.cc
+++ b/src/turbomind/models/llama/LlamaV2.cc
@@ -90,7 +90,7 @@ LlamaV2::LlamaV2(DataType dtype,
 
     // using float to avoid data overflow
     dynamic_decode_ = std::make_unique<DynamicDecodeLayer>(
-        kFloat32, max_batch_size, vocab_size_, vocab_size_padded_, stream_, &ctx.device_prop);
+        kFloat32, max_batch_size, vocab_size_, vocab_size_padded_, stream_, &ctx.device_prop, engine.mlp_tp_rank);
 }
 
 void LlamaV2::updateEmbedding(char* decoder_input,
diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
index 853b0a96d8..841e538fb4 100644
--- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
+++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
@@ -454,8 +454,12 @@ std::unique_ptr<ModelRequest> LlamaTritonModel::createModelInstance(int device_i
 {
     FT_CHECK(engines_[device_id] != nullptr);
 
-    return std::make_unique<ModelRequest>(
-        gateway_.get(), dtype_, engine_param_.session_len, model_param_.vocab_size, model_param_.hidden_units);
+    return std::make_unique<ModelRequest>(gateway_.get(),
+                                          dtype_,
+                                          engine_param_.session_len,
+                                          model_param_.vocab_size,
+                                          model_param_.hidden_units,
+                                          comm_size_);
 }
 
 void LlamaTritonModel::createSharedWeights(int device_id, int rank)