src/turbomind/engine/model_request.cc (14 changes: 11 additions & 3 deletions)
@@ -13,12 +13,14 @@
 
 namespace turbomind {
 
-ModelRequest::ModelRequest(Gateway* gateway, DataType data_type, int session_len, int vocab_size, int hidden_dim):
+ModelRequest::ModelRequest(
+    Gateway* gateway, DataType data_type, int session_len, int vocab_size, int hidden_dim, int tp_size):
     gateway_{gateway},
     data_type_{data_type},
     session_len_{session_len},
     vocab_size_{vocab_size},
-    hidden_dim_{hidden_dim}
+    hidden_dim_{hidden_dim},
+    tp_size_{tp_size}
 {
 }

@@ -127,8 +129,14 @@ auto ModelRequest::Forward(InputParam param, std::function<void()> cb) -> OutputParam
     r->output_ids      = outputs_->at("output_ids");
     r->sequence_length = outputs_->at("sequence_length");
 
+    r->matchers.clear();
     if (grammar_) {
-        r->matcher = std::make_shared<xgrammar::GrammarMatcher>(*grammar_);
+        for (int i = 0; i < tp_size_; ++i) {
+            r->matchers.push_back(std::make_shared<xgrammar::GrammarMatcher>(*grammar_));
+        }
     }
+    else {
+        r->matchers.resize(tp_size_);
+    }
 
     // Keep a weak reference for canceling the request
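A `GrammarMatcher` is stateful: it advances as tokens are accepted, so under tensor parallelism each rank must own a private copy rather than share one object across threads. Below is a minimal sketch of the allocation pattern this hunk introduces, assuming `grammar_` is a `std::shared_ptr<xgrammar::CompiledGrammar>` as the dereference above suggests; the helper name is hypothetical, not part of the PR.

```cpp
#include <memory>
#include <vector>

#include <xgrammar/xgrammar.h>

// Hypothetical helper mirroring the Forward() hunk above: one stateful
// matcher per tensor-parallel rank, all built from the same compiled
// grammar. Without a grammar the vector still holds tp_size null slots,
// so matchers[tp_rank] is always a valid access.
std::vector<std::shared_ptr<xgrammar::GrammarMatcher>>
MakePerRankMatchers(const std::shared_ptr<xgrammar::CompiledGrammar>& grammar, int tp_size)
{
    std::vector<std::shared_ptr<xgrammar::GrammarMatcher>> matchers;
    if (grammar) {
        for (int i = 0; i < tp_size; ++i) {
            matchers.push_back(std::make_shared<xgrammar::GrammarMatcher>(*grammar));
        }
    }
    else {
        matchers.resize(tp_size);
    }
    return matchers;
}
```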
src/turbomind/engine/model_request.h (3 changes: 2 additions & 1 deletion)
@@ -15,7 +15,7 @@ class ModelRequest {
 public:
     virtual ~ModelRequest() = default;
 
-    ModelRequest(Gateway* gateway, DataType data_type, int session_len, int vocab_size, int hidden_dim);
+    ModelRequest(Gateway* gateway, DataType data_type, int session_len, int vocab_size, int hidden_dim, int tp_size);
 
     // Cancel running request
     void Cancel();
@@ -50,6 +50,7 @@ class ModelRequest {
     const int session_len_;
     const int hidden_dim_;
     const int vocab_size_;
+    const int tp_size_;
 
     uint64_t session_id_;
 
src/turbomind/engine/request.h (2 changes: 1 addition & 1 deletion)
@@ -154,7 +154,7 @@ struct Request {
         kInconsistency = 9,  // Inconsistent request parameters, e.g. prefix caching is not allowed in interactive mode
     };
 
-    std::shared_ptr<xgrammar::GrammarMatcher> matcher;
+    std::vector<std::shared_ptr<xgrammar::GrammarMatcher>> matchers;  // GrammarMatchers for different threads (tp_size)
 };
 
 inline void UpdateState(Request& r, int status, int seq_len)
src/turbomind/layers/BaseDynamicDecodeLayer.h (3 changes: 3 additions & 0 deletions)
@@ -31,6 +31,7 @@ class BaseDynamicDecodeLayer {
         int vocab_size_padded;
         cudaStream_t stream;
         const cudaDeviceProp* device_prop;
+        int tp_rank;
     };
 
     virtual ~BaseDynamicDecodeLayer() = default;
@@ -42,6 +43,7 @@
         vocab_size_padded_ = param.vocab_size_padded;
         stream_ = param.stream;
         device_prop_ = param.device_prop;
+        tp_rank_ = param.tp_rank;
     };
 
     virtual void Setup(const std::vector<const Request*>& rs, const TensorMap& args) = 0;
@@ -54,6 +56,7 @@
     int vocab_size_padded_;
     cudaStream_t stream_;
     const cudaDeviceProp* device_prop_;
+    int tp_rank_;
 };
 
 }  // namespace turbomind
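Because `tp_rank` rides along in `BaseParam`, every layer in the decode stack inherits `tp_rank_` with no extra plumbing. A sketch of a derived layer using it, assuming the base class also declares a virtual `Forward(TensorMap&)` counterpart to the pure virtual `Setup` above; the class itself is illustrative, not part of the diff.

```cpp
// Illustrative layer showing the inherited tp_rank_ in use; BaseParam,
// Request and TensorMap are the types from the headers in this diff.
class RankAwareLayer: public BaseDynamicDecodeLayer {
public:
    explicit RankAwareLayer(const BaseParam& param): BaseDynamicDecodeLayer{param} {}

    void Setup(const std::vector<const Request*>& rs, const TensorMap& args) override
    {
        // Pick this rank's private matcher, as the guided-decode layers below do.
        for (const auto& r : rs) {
            auto& matcher = r->matchers[tp_rank_];
            (void)matcher;
        }
    }

    void Forward(TensorMap& args) override {}  // assumed virtual in the base
};
```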
src/turbomind/layers/DynamicDecodeLayer.cc (7 changes: 5 additions & 2 deletions)
@@ -31,11 +31,14 @@ DynamicDecodeLayer::DynamicDecodeLayer(DataType dtype,
                                        int vocab_size,
                                        int vocab_size_padded,
                                        cudaStream_t stream,
-                                       const cudaDeviceProp* device_prop)
+                                       const cudaDeviceProp* device_prop,
+                                       int tp_rank):
+    tp_rank_{tp_rank}
 {
     TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     TM_CHECK(dtype == kFloat32);
-    BaseDynamicDecodeLayer::BaseParam param{max_batch_size, vocab_size, vocab_size_padded, stream, device_prop};
+    BaseDynamicDecodeLayer::BaseParam param{
+        max_batch_size, vocab_size, vocab_size_padded, stream, device_prop, tp_rank};
     layers_.emplace_back(new LogitsProcessorLayer<float>{param});
     layers_.emplace_back(new GuidedDecodeMaskLayer<float>{param});
     layers_.emplace_back(new SamplingLayer<float>{param});
src/turbomind/layers/DynamicDecodeLayer.h (4 changes: 3 additions & 1 deletion)
@@ -33,7 +33,8 @@ class DynamicDecodeLayer {
                       int vocab_size,
                       int vocab_size_padded,
                       cudaStream_t stream,
-                      const cudaDeviceProp* device_prop);
+                      const cudaDeviceProp* device_prop,
+                      int tp_rank);
 
     ~DynamicDecodeLayer();

@@ -42,6 +43,7 @@
     void Forward(TensorMap& args);
 
 private:
+    int tp_rank_;
     std::vector<std::unique_ptr<BaseDynamicDecodeLayer>> layers_;
 };

src/turbomind/layers/sampling_layers/GuidedDecodeMaskLayer.cc (1 addition & 1 deletion)
@@ -33,7 +33,7 @@ void GuidedDecodeMaskLayer<T>::Setup(const std::vector<const Request*>& rs, const TensorMap& args)
     TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
     matchers_.clear();
     for (const auto& r : rs) {
-        matchers_.push_back(r->matcher);
+        matchers_.push_back(r->matchers[tp_rank_]);
     }
 }
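With one matcher per rank, the mask and update layers need no cross-thread locking: each TP worker reads and advances only the matcher at its own index, and because all ranks sample identical tokens, the per-rank grammar states stay in lockstep. A hedged sketch of the per-step update on one rank; `AcceptToken` is xgrammar's public API for advancing matcher state, while the wrapper function is illustrative.

```cpp
#include <cstdint>
#include <memory>

#include <xgrammar/xgrammar.h>

// Illustrative per-rank step: feed the token sampled for a sequence into
// this rank's own matcher. In the real layers the matcher is the
// r->matchers[tp_rank_] entry captured in Setup().
void AdvanceRankLocalMatcher(const std::shared_ptr<xgrammar::GrammarMatcher>& matcher,
                             int32_t token_id)
{
    if (matcher) {  // null when the request has no grammar
        matcher->AcceptToken(token_id);
    }
}
```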
src/turbomind/layers/sampling_layers/GuidedDecodeUpdateLayer.cc (1 addition & 1 deletion)
@@ -29,7 +29,7 @@ void GuidedDecodeUpdateLayer<T>::Setup(const std::vector<const Request*>& rs, const TensorMap& args)
     TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
     matchers_.clear();
     for (const auto& r : rs) {
-        matchers_.push_back(r->matcher);
+        matchers_.push_back(r->matchers[tp_rank_]);
     }
 }
src/turbomind/models/llama/LlamaV2.cc (2 changes: 1 addition & 1 deletion)
@@ -90,7 +90,7 @@ LlamaV2::LlamaV2(DataType dtype,
 
     // using float to avoid data overflow
     dynamic_decode_ = std::make_unique<DynamicDecodeLayer>(
-        kFloat32, max_batch_size, vocab_size_, vocab_size_padded_, stream_, &ctx.device_prop);
+        kFloat32, max_batch_size, vocab_size_, vocab_size_padded_, stream_, &ctx.device_prop, engine.mlp_tp_rank);
 }
 
 void LlamaV2::updateEmbedding(char* decoder_input,
src/turbomind/triton_backend/llama/LlamaTritonModel.cc (8 changes: 6 additions & 2 deletions)
@@ -454,8 +454,12 @@ std::unique_ptr<ModelRequest> LlamaTritonModel::createModelInstance(int device_id)
 {
     FT_CHECK(engines_[device_id] != nullptr);
 
-    return std::make_unique<ModelRequest>(
-        gateway_.get(), dtype_, engine_param_.session_len, model_param_.vocab_size, model_param_.hidden_units);
+    return std::make_unique<ModelRequest>(gateway_.get(),
+                                          dtype_,
+                                          engine_param_.session_len,
+                                          model_param_.vocab_size,
+                                          model_param_.hidden_units,
+                                          comm_size_);
 }
 
 void LlamaTritonModel::createSharedWeights(int device_id, int rank)
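Putting the pieces together: `LlamaTritonModel` knows the TP world size (`comm_size_`) and bakes it into every `ModelRequest`, while each `LlamaV2` replica passes its own rank (`engine.mlp_tp_rank`) into its decode stack. The two must agree, or `r->matchers[tp_rank_]` indexes out of range. A toy sketch of that invariant with stand-in types; nothing below is the actual call site.

```cpp
#include <cassert>
#include <memory>
#include <vector>

// Stand-in for xgrammar::GrammarMatcher; illustrative only.
struct Matcher {};

int main()
{
    const int tp_size = 2;  // what comm_size_ supplies to ModelRequest

    // ModelRequest::Forward() sizes the vector to tp_size ...
    std::vector<std::shared_ptr<Matcher>> matchers(tp_size);

    // ... and each rank (engine.mlp_tp_rank) touches only its own slot,
    // so every valid rank must be < tp_size.
    for (int tp_rank = 0; tp_rank < tp_size; ++tp_rank) {
        assert(tp_rank < static_cast<int>(matchers.size()));
        matchers[tp_rank] = std::make_shared<Matcher>();
    }
    return 0;
}
```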