diff --git a/src/turbomind/engine/model_request.cc b/src/turbomind/engine/model_request.cc
index ba7ebe321f..805b6dac5d 100644
--- a/src/turbomind/engine/model_request.cc
+++ b/src/turbomind/engine/model_request.cc
@@ -13,12 +13,14 @@
 namespace turbomind {
 
-ModelRequest::ModelRequest(Gateway* gateway, DataType data_type, int session_len, int vocab_size, int hidden_dim):
+ModelRequest::ModelRequest(
+    Gateway* gateway, DataType data_type, int session_len, int vocab_size, int hidden_dim, int tp_size):
     gateway_{gateway},
     data_type_{data_type},
     session_len_{session_len},
     vocab_size_{vocab_size},
-    hidden_dim_{hidden_dim}
+    hidden_dim_{hidden_dim},
+    tp_size_{tp_size}
 {
 }
 
@@ -127,8 +129,14 @@ auto ModelRequest::Forward(InputParam param, std::function<void()> cb) -> Output
     r->output_ids      = outputs_->at("output_ids");
     r->sequence_length = outputs_->at("sequence_length");
 
+    r->matchers.clear();
     if (grammar_) {
-        r->matcher = std::make_shared<xgrammar::GrammarMatcher>(*grammar_);
+        for (int i = 0; i < tp_size_; ++i) {
+            r->matchers.push_back(std::make_shared<xgrammar::GrammarMatcher>(*grammar_));
+        }
+    }
+    else {
+        r->matchers.resize(tp_size_);
     }
 
     // Keep a weak reference for canceling the request
diff --git a/src/turbomind/engine/model_request.h b/src/turbomind/engine/model_request.h
index 7582163095..b05a980312 100644
--- a/src/turbomind/engine/model_request.h
+++ b/src/turbomind/engine/model_request.h
@@ -15,7 +15,7 @@ class ModelRequest {
 public:
     virtual ~ModelRequest() = default;
 
-    ModelRequest(Gateway* gateway, DataType data_type, int session_len, int vocab_size, int hidden_dim);
+    ModelRequest(Gateway* gateway, DataType data_type, int session_len, int vocab_size, int hidden_dim, int tp_size);
 
     // Cancel running request
     void Cancel();
@@ -50,6 +50,7 @@ class ModelRequest {
     const int session_len_;
     const int hidden_dim_;
     const int vocab_size_;
+    const int tp_size_;
 
     uint64_t session_id_;
 
diff --git a/src/turbomind/engine/request.h b/src/turbomind/engine/request.h
index aa50a48100..a5661bf82c 100644
--- a/src/turbomind/engine/request.h
+++ b/src/turbomind/engine/request.h
@@ -154,7 +154,7 @@ struct Request {
         kInconsistency = 9,  // Inconsistent request parameters, e.g. prefix caching is not allowed in interactive mode
     };
 
-    std::shared_ptr<xgrammar::GrammarMatcher> matcher;
+    std::vector<std::shared_ptr<xgrammar::GrammarMatcher>> matchers;  // GrammarMatchers for different threads (tp_size)
 };
 
 inline void UpdateState(Request& r, int status, int seq_len)
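[Reviewer note, not part of the patch] The change above is the heart of the fix:
a Request now owns one GrammarMatcher per tensor-parallel rank instead of a
single shared instance. The matcher is stateful (it advances every time a token
is accepted), and with TP each rank runs the sampling layers on its own thread,
so sharing one instance across ranks is a data race. A minimal sketch of the
ownership pattern, with illustrative stand-in names (StatefulMatcher plays the
role of xgrammar::GrammarMatcher):

#include <memory>
#include <vector>

struct StatefulMatcher {
    int accepted = 0;
    void Accept(int /*token*/) { ++accepted; }  // mutates internal state
};

struct RequestSketch {
    std::vector<std::shared_ptr<StatefulMatcher>> matchers;  // one per rank
};

int main()
{
    const int     tp_size = 4;
    RequestSketch r;
    for (int i = 0; i < tp_size; ++i) {
        r.matchers.push_back(std::make_shared<StatefulMatcher>());
    }
    // Every rank accepts the same sampled token, so the copies never diverge,
    // and rank i only ever touches r.matchers[i].
    const int sampled = 42;
    for (auto& m : r.matchers) {
        m->Accept(sampled);
    }
}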
diff --git a/src/turbomind/layers/BaseDynamicDecodeLayer.h b/src/turbomind/layers/BaseDynamicDecodeLayer.h
index a3e14407ff..239b8bc78d 100644
--- a/src/turbomind/layers/BaseDynamicDecodeLayer.h
+++ b/src/turbomind/layers/BaseDynamicDecodeLayer.h
@@ -31,6 +31,7 @@ class BaseDynamicDecodeLayer {
         int                   vocab_size_padded;
         cudaStream_t          stream;
         const cudaDeviceProp* device_prop;
+        int                   tp_rank;
     };
 
     virtual ~BaseDynamicDecodeLayer() = default;
@@ -42,6 +43,7 @@
         vocab_size_padded_ = param.vocab_size_padded;
         stream_            = param.stream;
         device_prop_       = param.device_prop;
+        tp_rank_           = param.tp_rank;
     };
 
     virtual void Setup(const std::vector<const Request*>& rs, const TensorMap& args) = 0;
@@ -54,6 +56,7 @@
     int                   vocab_size_padded_;
     cudaStream_t          stream_;
     const cudaDeviceProp* device_prop_;
+    int                   tp_rank_;
 };
 
 }  // namespace turbomind
diff --git a/src/turbomind/layers/DynamicDecodeLayer.cc b/src/turbomind/layers/DynamicDecodeLayer.cc
index 5a66bf1fb6..11a54cb6aa 100644
--- a/src/turbomind/layers/DynamicDecodeLayer.cc
+++ b/src/turbomind/layers/DynamicDecodeLayer.cc
@@ -31,11 +31,14 @@ DynamicDecodeLayer::DynamicDecodeLayer(DataType dtype,
                                        int                   vocab_size,
                                        int                   vocab_size_padded,
                                        cudaStream_t          stream,
-                                       const cudaDeviceProp* device_prop)
+                                       const cudaDeviceProp* device_prop,
+                                       int                   tp_rank):
+    tp_rank_{tp_rank}
 {
     TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     TM_CHECK(dtype == kFloat32);
-    BaseDynamicDecodeLayer::BaseParam param{max_batch_size, vocab_size, vocab_size_padded, stream, device_prop};
+    BaseDynamicDecodeLayer::BaseParam param{
+        max_batch_size, vocab_size, vocab_size_padded, stream, device_prop, tp_rank};
     layers_.emplace_back(new LogitsProcessorLayer<float>{param});
     layers_.emplace_back(new GuidedDecodeMaskLayer<float>{param});
     layers_.emplace_back(new SamplingLayer<float>{param});
diff --git a/src/turbomind/layers/DynamicDecodeLayer.h b/src/turbomind/layers/DynamicDecodeLayer.h
index c527ff8e0f..233f7f6f9b 100644
--- a/src/turbomind/layers/DynamicDecodeLayer.h
+++ b/src/turbomind/layers/DynamicDecodeLayer.h
@@ -33,7 +33,8 @@ class DynamicDecodeLayer {
                        int                   vocab_size,
                        int                   vocab_size_padded,
                        cudaStream_t          stream,
-                       const cudaDeviceProp* device_prop);
+                       const cudaDeviceProp* device_prop,
+                       int                   tp_rank);
 
     ~DynamicDecodeLayer();
 
@@ -42,6 +43,7 @@
     void Forward(TensorMap& args);
 
 private:
+    int tp_rank_;
     std::vector<std::unique_ptr<BaseDynamicDecodeLayer>> layers_;
 };
 
diff --git a/src/turbomind/layers/sampling_layers/GuidedDecodeMaskLayer.cc b/src/turbomind/layers/sampling_layers/GuidedDecodeMaskLayer.cc
index 2262992902..2371fe56c0 100644
--- a/src/turbomind/layers/sampling_layers/GuidedDecodeMaskLayer.cc
+++ b/src/turbomind/layers/sampling_layers/GuidedDecodeMaskLayer.cc
@@ -33,7 +33,7 @@ void GuidedDecodeMaskLayer<T>::Setup(const std::vector<const Request*>& rs, cons
     TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
     matchers_.clear();
     for (const auto& r : rs) {
-        matchers_.push_back(r->matcher);
+        matchers_.push_back(r->matchers[tp_rank_]);
     }
 }
 
diff --git a/src/turbomind/layers/sampling_layers/GuidedDecodeUpdateLayer.cc b/src/turbomind/layers/sampling_layers/GuidedDecodeUpdateLayer.cc
index 653a8874d8..25c48e3f23 100644
--- a/src/turbomind/layers/sampling_layers/GuidedDecodeUpdateLayer.cc
+++ b/src/turbomind/layers/sampling_layers/GuidedDecodeUpdateLayer.cc
@@ -15,6 +15,7 @@
  */
 
 #include "src/turbomind/layers/sampling_layers/GuidedDecodeUpdateLayer.h"
+#include "src/turbomind/core/context.h"
 
 namespace turbomind {
 
@@ -29,7 +30,7 @@ void GuidedDecodeUpdateLayer<T>::Setup(const std::vector<const Request*>& rs, co
     TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
     matchers_.clear();
     for (const auto& r : rs) {
-        matchers_.push_back(r->matcher);
+        matchers_.push_back(r->matchers[tp_rank_]);
     }
 }
 
@@ -45,6 +46,7 @@ void GuidedDecodeUpdateLayer<T>::Forward(TensorMap& args)
     FT_CHECK(bsz == matchers_.size());
 
     Copy(output_ids.slice(step * bsz, bsz), output_ids_buf);
+    core::Context::stream().Sync();
 
     for (size_t i = 0; i < bsz; ++i) {
         const auto& matcher = matchers_[i];
diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc
index 68185eac38..729f9b380a 100644
--- a/src/turbomind/models/llama/LlamaV2.cc
+++ b/src/turbomind/models/llama/LlamaV2.cc
@@ -90,7 +90,7 @@ LlamaV2::LlamaV2(DataType dtype,
 
     // using float to avoid data overflow
     dynamic_decode_ = std::make_unique<DynamicDecodeLayer>(
-        kFloat32, max_batch_size, vocab_size_, vocab_size_padded_, stream_, &ctx.device_prop);
+        kFloat32, max_batch_size, vocab_size_, vocab_size_padded_, stream_, &ctx.device_prop, engine.mlp_tp_rank);
 }
 
 void LlamaV2::updateEmbedding(char* decoder_input,
diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
index 853b0a96d8..841e538fb4 100644
--- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
+++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
@@ -454,8 +454,12 @@ std::unique_ptr<ModelRequest> LlamaTritonModel::createModelInstance(int device_i
 {
     FT_CHECK(engines_[device_id] != nullptr);
 
-    return std::make_unique<ModelRequest>(
-        gateway_.get(), dtype_, engine_param_.session_len, model_param_.vocab_size, model_param_.hidden_units);
+    return std::make_unique<ModelRequest>(gateway_.get(),
+                                          dtype_,
+                                          engine_param_.session_len,
+                                          model_param_.vocab_size,
+                                          model_param_.hidden_units,
+                                          comm_size_);
 }
 
 void LlamaTritonModel::createSharedWeights(int device_id, int rank)