Commit 59e3a81

Merge branch 'release/1.1' into user/barry/fix_sm100_r1_ci
2 parents: 57f5ba5 + b334102

File tree

11 files changed (+53 −791 lines)


tensorrt_llm/_torch/models/modeling_llama.py

Lines changed: 2 additions & 2 deletions

@@ -598,7 +598,7 @@ def forward(
             ))

         # Unpack the allreduce output
-        if self.next_attn is not None and self.is_nvfp4:
+        if self.post_feed_forward_fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4:
             act_fp4, act_sf, residual = allreduce_output
             hidden_states = Fp4QuantizedTensor(act_fp4, act_sf)
         else:

@@ -789,7 +789,7 @@ def forward(
                 scale=scale,
                 eps=self.next_layer_layernorm.variance_epsilon,
             ))
-        if self.next_attn is not None and self.is_nvfp4:
+        if self.post_mlp_fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4:
             act_fp4, act_sf, residual = all_reduce_output
             hidden_states = Fp4QuantizedTensor(act_fp4, act_sf)
         else:
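
For context on the modeling_llama.py change: the allreduce output layout depends on which fusion op was requested, so checking the recorded fusion op directly keeps the unpacking in sync with what the kernel actually returned, instead of inferring it from unrelated state (`self.next_attn is not None and self.is_nvfp4`). Below is a minimal, self-contained sketch of that dispatch pattern; the enum member other than RESIDUAL_RMS_NORM_QUANT_NVFP4 and the tuple layouts are illustrative assumptions, not the TensorRT-LLM source.

# Minimal sketch of "dispatch on the fusion op that produced the output".
# Only AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4 is taken from the diff.
from enum import Enum, auto


class AllReduceFusionOp(Enum):
    RESIDUAL_RMS_NORM = auto()
    RESIDUAL_RMS_NORM_QUANT_NVFP4 = auto()


def unpack_allreduce_output(fusion_op, allreduce_output):
    """Unpack the fused allreduce result according to the op that produced it."""
    if fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4:
        # Assumed layout: NVFP4 fusion returns (quantized acts, scale factors, residual).
        act_fp4, act_sf, residual = allreduce_output
        return ("fp4", act_fp4, act_sf, residual)
    # Assumed layout: non-quantizing fusion returns (hidden_states, residual).
    hidden_states, residual = allreduce_output
    return ("dense", hidden_states, None, residual)


# The same call site handles both layouts without consulting whether a
# next attention layer happens to exist.
print(unpack_allreduce_output(AllReduceFusionOp.RESIDUAL_RMS_NORM, ("h", "res")))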

tensorrt_llm/executor/base_worker.py

Lines changed: 1 addition & 1 deletion

@@ -432,7 +432,7 @@ def _deduce_max_tokens(request: GenerationRequest,
         # default_max_tokens is the biggest available value
         if max_tokens is None:
             return default_max_tokens
-        elif max_tokens > default_max_tokens:
+        elif max_tokens > default_max_tokens and default_max_tokens > 0:
             logger.warning(
                 f"User-specified `max_tokens` ({max_tokens}) is greater than deduced "
                 f"`default_max_tokens` ({default_max_tokens}), using default_max_tokens instead."

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 3 additions & 3 deletions

@@ -645,15 +645,15 @@ def test_nvfp4_tp4(self):

     @pytest.mark.skip_less_device(4)
     @skip_pre_blackwell
-    def test_fp8_tp2pp2(self):
-        model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP8"
+    def test_fp4_tp2pp2(self):
+        model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP4"
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
         with LLM(model_path,
                  tensor_parallel_size=2,
                  pipeline_parallel_size=2,
                  max_batch_size=32,
                  kv_cache_config=kv_cache_config) as llm:
-            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
             sampling_params = SamplingParams(
                 max_tokens=256,
                 temperature=0.0,

tests/integration/test_lists/qa/llm_function_core.txt

Lines changed: 1 addition & 1 deletion

@@ -418,7 +418,7 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True]
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
-accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp2pp2
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True]
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False]

tests/integration/test_lists/qa/llm_function_core_sanity.txt

Lines changed: 1 addition & 1 deletion

@@ -117,7 +117,7 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized
 accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized
-accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp2pp2
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True]

tests/integration/test_lists/test-db/l0_a30.yml

Lines changed: 1 addition & 1 deletion

@@ -19,7 +19,7 @@ l0_a30:
   - unittest/_torch/modeling -k "modeling_qwen"
   - unittest/_torch/modeling -k "modeling_qwen_moe"
   - unittest/_torch/modeling -k "modeling_out_of_tree"
-  - unittest/_torch/auto_deploy/unit/singlegpu -k "not test_trtllm_bench_backend_comparison"
+  - unittest/_torch/auto_deploy/unit/singlegpu
   - unittest/_torch/sampler/test_beam_search.py
   - test_e2e.py::test_openai_completions_with_logit_bias[torch_sampler]
   - test_e2e.py::test_openai_chat_with_logit_bias[torch_sampler]
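
The a30 and b200 test-db edits drop the `-k "not test_trtllm_bench_backend_comparison"` deselection, so the whole auto_deploy singlegpu unit directory is collected on those stages; the dedicated l0_h100 entry for that test is removed further down. As an illustrative sketch of what such a `-k` expression does (the invocation below is not taken from the CI harness):

# Illustrative only: how a "-k" deselection narrows a test directory.
# The directory path matches the test-list entry; everything else is assumed.
import pytest

# Old behaviour: run the singlegpu unit tests but skip the bench comparison test.
pytest.main([
    "unittest/_torch/auto_deploy/unit/singlegpu",
    "-k", "not test_trtllm_bench_backend_comparison",
])

# New behaviour: no -k filter, so the full directory is collected and run.
pytest.main(["unittest/_torch/auto_deploy/unit/singlegpu"])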

tests/integration/test_lists/test-db/l0_b200.yml

Lines changed: 1 addition & 1 deletion

@@ -74,7 +74,7 @@ l0_b200:
   - unittest/_torch/modeling -k "modeling_mixtral"
   - unittest/_torch/modeling -k "modeling_gpt_oss"
   - unittest/_torch/modeling/test_modeling_exaone4.py::TestEXAONE4::test_llm_load_1_FP8
-  - unittest/_torch/auto_deploy/unit/singlegpu -k "not test_trtllm_bench_backend_comparison"
+  - unittest/_torch/auto_deploy/unit/singlegpu
 - condition:
     ranges:
       system_gpu_count:

tests/integration/test_lists/test-db/l0_dgx_b200.yml

Lines changed: 1 addition & 1 deletion

@@ -51,7 +51,7 @@ l0_dgx_b200:
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
-  - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp2pp2
+  - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2
   - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
 - condition:

tests/integration/test_lists/test-db/l0_h100.yml

Lines changed: 0 additions & 1 deletion

@@ -31,7 +31,6 @@ l0_h100:
   - unittest/_torch/modeling -k "modeling_nemotron"
   - unittest/_torch/modeling -k "modeling_gemma3"
   - unittest/_torch/modeling -k "modeling_gpt_oss"
-  - unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py::test_trtllm_bench_backend_comparison
   - unittest/disaggregated/test_disagg_utils.py
   - unittest/disaggregated/test_router.py
   - unittest/disaggregated/test_remoteDictionary.py

tests/unittest/_torch/auto_deploy/_utils_test/_model_test_utils.py

Lines changed: 6 additions & 0 deletions

@@ -446,6 +446,12 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1):
             "vision_config": {"num_hidden_layers": 2},
         },
     },
+    "TinyLlama/TinyLlama-1.1B-Chat-v1.0": {
+        "llm_models_subdir": "llama-models-v2/TinyLlama-1.1B-Chat-v1.0",
+        "model_kwargs": {
+            "num_hidden_layers": 2,
+        },
+    },
 }
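
The _model_test_utils.py addition registers a TinyLlama checkpoint in the auto-deploy test-model table with a shrunken config. A hypothetical sketch of how a consumer might resolve such an entry follows; only the keys `llm_models_subdir` and `model_kwargs` come from the diff, while the lookup helper and root path are assumptions.

# Hypothetical consumer of the test-model registry; only the entry's keys
# ("llm_models_subdir", "model_kwargs") come from the diff above.
import os

_TEST_MODELS = {
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0": {
        "llm_models_subdir": "llama-models-v2/TinyLlama-1.1B-Chat-v1.0",
        "model_kwargs": {"num_hidden_layers": 2},
    },
}


def resolve_test_model(name, llm_models_root="/models"):
    """Return (local checkpoint path, kwargs that shrink the model for tests)."""
    entry = _TEST_MODELS[name]
    path = os.path.join(llm_models_root, entry["llm_models_subdir"])
    return path, entry["model_kwargs"]


path, kwargs = resolve_test_model("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
print(path)    # /models/llama-models-v2/TinyLlama-1.1B-Chat-v1.0
print(kwargs)  # {'num_hidden_layers': 2} -> tiny config keeps the unit test fast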