
Commit 1e5c901

cjluo-nv authored and mxinO committed
[NVBUG: 5617733] Update LLM generate API for modelopt LLM eval (#498)
## What does this PR do?

**Type of change:** Bug fix

**Overview:**

1) Remove kv_cache_config from the generate API. It is no longer used anywhere in the code; KV cache usage is now estimated from other parameters.
2) Add max_seq_len to the generate API to better estimate the real KV cache usage.
3) Assume a default lm_eval max input sequence length of 4096.

Signed-off-by: Chenjie Luo <[email protected]>
Signed-off-by: mxin <[email protected]>
1 parent 016f64c commit 1e5c901

4 files changed: +23 −14 lines changed

examples/llm_eval/gen_model_answer.py

Lines changed: 1 addition & 1 deletion
@@ -181,7 +181,7 @@ def get_model_answers(
    tokenizer = get_tokenizer(model_path, trust_remote_code=args.trust_remote_code)
    if checkpoint_dir:
        assert LLM is not None, "tensorrt_llm APIs could not be imported."
-       model = LLM(checkpoint_dir, tokenizer=tokenizer)
+       model = LLM(checkpoint_dir, tokenizer=tokenizer, max_batch_size=1)
    elif not nim_model:
        model, _ = load_model(
            model_path,
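
For reference, a minimal sketch of how the updated runner is constructed with the keyword arguments introduced in this commit; the checkpoint path and tokenizer setup below are illustrative assumptions, not code from the diff.

from transformers import AutoTokenizer

from modelopt.deploy.llm import LLM

# Hypothetical checkpoint path for illustration only.
checkpoint_dir = "/path/to/quantized/checkpoint"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

# max_batch_size=1 matches the single-request answer generation above; max_seq_len bounds
# the sequence length used to estimate KV cache usage (0 would leave it unspecified).
model = LLM(checkpoint_dir, tokenizer=tokenizer, max_batch_size=1, max_seq_len=4096)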

examples/llm_eval/lm_eval_tensorrt_llm.py

Lines changed: 9 additions & 3 deletions
@@ -30,7 +30,7 @@
from lm_eval.models.api_models import TemplateAPI
from transformers import BatchEncoding

-from modelopt.deploy.llm.generate import LLM
+from modelopt.deploy.llm import LLM

logger = logging.getLogger(__name__)

@@ -58,8 +58,14 @@ def __init__(

        assert isinstance(checkpoint_dir, str)

-        self.llm = LLM(checkpoint_dir=checkpoint_dir, tokenizer=self.tokenizer)
-        self.max_length = self.llm.max_seq_len - 1
+        max_length = kwargs.get("max_length", self._max_gen_toks + 4096)
+        self.llm = LLM(
+            checkpoint_dir=checkpoint_dir,
+            tokenizer=self.tokenizer,
+            max_batch_size=int(batch_size),
+            max_seq_len=max_length,
+        )
+        self.max_length = max_length - 1
        logger.info("Loaded TRT-LLM")

    def model_call(
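
As a sanity check on the new default, a small sketch of the max_length arithmetic above; the 256-token value for max_gen_toks is an assumption standing in for lm_eval's runtime self._max_gen_toks.

# Sketch only: mirrors the defaulting logic in __init__ above.
max_gen_toks = 256                 # assumed generation budget (lm_eval's typical default)
user_max_length = None             # value from kwargs.get("max_length", ...), if provided

# Default to "generation budget + 4096 assumed input tokens" when no max_length is given.
max_length = user_max_length if user_max_length is not None else max_gen_toks + 4096  # 4352

# The TRT-LLM engine is sized with max_seq_len=max_length; the harness reports one less.
harness_max_length = max_length - 1  # 4351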

examples/llm_eval/mmlu.py

Lines changed: 2 additions & 0 deletions
@@ -259,6 +259,8 @@ def main(
            checkpoint_dir=kwargs["checkpoint_dir"],
            tokenizer=tokenizer,
            medusa_choices=medusa_choices,
+            max_seq_len=MAX_SEQ_LEN,
+            max_batch_size=1,
        )
    else:
        model = select_model(

modelopt/deploy/llm/generate.py

Lines changed: 11 additions & 10 deletions
@@ -57,22 +57,21 @@ def __init__(
        self,
        checkpoint_dir: str | Path,
        tokenizer: "str | Path | None" = None,
-        kv_cache_config: dict[str, int | float] = {},
        medusa_choices: Any = None,
        tp: int = 0,
        trust_remote_code: bool = False,
+        max_seq_len: int = 0,
        max_batch_size: int = 0,
    ):
        """Initializes the LLM runner class.

        Args:
            checkpoint_dir: the directory path of the model checkpoint.
            tokenizer: the tokenizer. For example, a tokenizer from the Huggingface model.
-            kv_cache_config: the kv cache config as a dict. Please refer to
-                https://nvidia.github.io/TensorRT-LLM/performance/performance-tuning-guide/
            medusa_choices: The medusa choices for the decoding config.
            tp: the tensor parallel size (for the torch backend). If 0, it will be set to the number of GPUs.
            trust_remote_code: whether to trust the remote code (for the torch backend).
+            max_seq_len: Max sequence length for the LLM backend. If 0, it is not specified.
            max_batch_size: Max batch size for the LLM backend. If 0, it is not specified.
        """
        with open(Path(checkpoint_dir) / "config.json") as config_file:

@@ -91,14 +90,16 @@ def _find_max_position_embeddings(cfg: dict) -> int | None:
            return None

        # Some VLMs may have a sub-config for max_position_embeddings, so we need to find it.
-        self._max_seq_len = _find_max_position_embeddings(config)
-        if self._max_seq_len is None:
-            warnings.warn(
-                "max_position_embeddings not found in config.json, using default value 8192"
-            )
-            self._max_seq_len = 8192
+        if max_seq_len > 0:
+            self._max_seq_len = max_seq_len
        else:
-            print(f"max_position_embeddings: {self._max_seq_len}")
+            self._max_seq_len = _find_max_position_embeddings(config)
+            if self._max_seq_len is None:
+                warnings.warn(
+                    "max_position_embeddings not found in config.json, using default value 8192"
+                )
+                self._max_seq_len = 8192
+            print(f"max_position_embeddings: {self._max_seq_len}")
        self._max_beam_width = 1

        kwargs = {}
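
The precedence introduced above can be read as a standalone helper. This is a sketch only: a flat dict lookup stands in for the recursive _find_max_position_embeddings helper used in generate.py.

import warnings

def resolve_max_seq_len(max_seq_len: int, config: dict) -> int:
    """Return the sequence length used to size the KV cache.

    Precedence: explicit max_seq_len argument, then max_position_embeddings from
    config.json, then a warned fallback of 8192.
    """
    if max_seq_len > 0:
        # An explicit max_seq_len always wins, so KV cache sizing matches the caller's intent.
        return max_seq_len
    found = config.get("max_position_embeddings")  # simplified, non-recursive lookup
    if found is None:
        warnings.warn("max_position_embeddings not found in config.json, using default value 8192")
        return 8192
    print(f"max_position_embeddings: {found}")
    return found

# Illustrative calls with a made-up config:
print(resolve_max_seq_len(0, {"max_position_embeddings": 32768}))     # -> 32768
print(resolve_max_seq_len(4096, {"max_position_embeddings": 32768}))  # -> 4096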
