Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/nemo_safe_synthesizer/generation/AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ In `GenerationBatches.add_batch()`:
## Gotchas

- `_resolve_temperature()` — forces `temperature=0.0` when `do_sample=False`; raises if `do_sample=False` and `temperature > 0`.
- `need_special_token_outputs` — `True` when processor is not `TabularDataProcessor` (VllmBackend) or not `TimeSeriesDataProcessor` (TimeseriesBackend). When True: `skip_special_tokens=False`, `include_stop_str_in_output=True`, `ignore_eos=True`. Needed for GroupedDataProcessor BOS/EOS.
- `need_special_token_outputs` — in `VllmBackend`, this is `True` when the processor is not `TabularDataProcessor`, and then we use `skip_special_tokens=False` and `include_stop_str_in_output=True` (needed e.g. for `GroupedDataProcessor` BOS/EOS). In `TimeseriesBackend`, special tokens are always skipped and stop strings are always excluded (`skip_special_tokens=True`, `include_stop_str_in_output=False`), independent of this flag. `ignore_eos` is always `False` (native EOS stopping).
Copy link

Copilot AI Mar 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This description of need_special_token_outputs is inaccurate: the stated condition "not TabularDataProcessor ... or not TimeSeriesDataProcessor" would be true for essentially any concrete processor, and it also doesn't match the code. In generation/vllm_backend.py, the condition is need_special_token_outputs = not isinstance(self.processor, TabularDataProcessor), while timeseries_backend.py sets skip_special_tokens=True/include_stop_str_in_output=False unconditionally. Please update this doc line to reflect the actual logic per backend.

Suggested change
- `need_special_token_outputs` — `True` when processor is not `TabularDataProcessor` (VllmBackend) or not `TimeSeriesDataProcessor` (TimeseriesBackend). When True: `skip_special_tokens=False`, `include_stop_str_in_output=True`. `ignore_eos` is always `False` (native EOS stopping). Needed for GroupedDataProcessor BOS/EOS.
- `need_special_token_outputs` — in `VllmBackend`, this is `True` when the processor is not `TabularDataProcessor`, and then we use `skip_special_tokens=False` and `include_stop_str_in_output=True` (needed e.g. for `GroupedDataProcessor` BOS/EOS). In `TimeseriesBackend`, special tokens are always skipped and stop strings are always excluded (`skip_special_tokens=True`, `include_stop_str_in_output=False`), independent of this flag. `ignore_eos` is always `False` (native EOS stopping).

Copilot uses AI. Check for mistakes.
- Timeseries single valid response — `_retain_single_valid_response()` keeps only the response with the most valid records per batch; others trimmed for sliding-window continuity.
- `typical_p` — wrapped as `TypicalLogitsWarperWrapper` (logits processor) because vLLM lacks native typical sampling.
- `num_beams` — mapped to `beam_width` only when `x > 1`; otherwise `(None, None)` (excluded from SamplingParams).
Expand Down
68 changes: 36 additions & 32 deletions src/nemo_safe_synthesizer/llm/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from __future__ import annotations

from pathlib import Path
from typing import ClassVar, Literal
from typing import ClassVar, Literal, cast

from pydantic import (
BaseModel,
Expand All @@ -38,7 +38,7 @@
field_validator,
model_validator,
)
from transformers import AutoConfig, AutoTokenizer, PretrainedConfig
from transformers import AutoConfig, AutoTokenizer, PretrainedConfig, PreTrainedTokenizer

from ..cli.artifact_structure import Workdir
from ..config.parameters import SafeSynthesizerParameters
Expand Down Expand Up @@ -94,7 +94,7 @@ class LLMPromptConfig(BaseModel):
"""Integer id for the EOS token."""

@classmethod
def from_tokenizer(cls, name: str, tokenizer: AutoTokenizer | None = None, **kwargs) -> LLMPromptConfig:
def from_tokenizer(cls, name: str, tokenizer: PreTrainedTokenizer | None = None, **kwargs) -> LLMPromptConfig:
"""Create a prompt config by reading from settings of a tokenizer.

If no ``tokenizer`` is supplied one is loaded from ``name``
Expand All @@ -111,11 +111,13 @@ def from_tokenizer(cls, name: str, tokenizer: AutoTokenizer | None = None, **kwa
Returns:
A new ``LLMPromptConfig`` populated from the tokenizer.
"""
tokenizer = tokenizer or AutoTokenizer.from_pretrained(name)
bos_token = kwargs.get("bos_token", getattr(tokenizer, "bos_token", None))
bos_token_id = kwargs.get("bos_token_id", getattr(tokenizer, "bos_token_id", None))
eos_token = kwargs.get("eos_token", getattr(tokenizer, "eos_token", None))
eos_token_id = kwargs.get("eos_token_id", getattr(tokenizer, "eos_token_id", None))
_tokenizer: PreTrainedTokenizer = cast(
PreTrainedTokenizer, AutoTokenizer.from_pretrained(name) if tokenizer is None else tokenizer
)
bos_token = kwargs.get("bos_token", getattr(_tokenizer, "bos_token", None))
bos_token_id = kwargs.get("bos_token_id", getattr(_tokenizer, "bos_token_id", None))
eos_token = kwargs.get("eos_token", getattr(_tokenizer, "eos_token", None))
eos_token_id = kwargs.get("eos_token_id", getattr(_tokenizer, "eos_token_id", None))
template = kwargs.get("template", PROMPT_TEMPLATE)
add_bos_token_to_prompt = kwargs.get("add_bos_token_to_prompt", True)
add_eos_token_to_prompt = kwargs.get("add_eos_token_to_prompt", True)
Expand Down Expand Up @@ -547,7 +549,7 @@ class Granite(ModelMetadata):
def __init__(
self,
model_name_or_path: str,
tokenizer=None,
tokenizer: PreTrainedTokenizer | None = None,
rope_scaling_factor: float | None = None,
**kwargs,
) -> None:
Expand All @@ -559,6 +561,7 @@ def __init__(
instruction=DEFAULT_INSTRUCTION,
prompt_config=LLMPromptConfig.from_tokenizer(
name=model_name_or_path,
tokenizer=tokenizer,
template="user\n {instruction} {schema} \n assistant\n{prefill}",
add_bos_token_to_prompt=False,
add_eos_token_to_prompt=True,
Expand All @@ -573,8 +576,8 @@ def __init__(
class Llama32(ModelMetadata):
"""Metadata for Meta Llama 3.2 model family.

Uses ``<|im_start|>`` (id 151644) as the BOS token and disables
automatic BOS/EOS injection in prompts.
Uses ``<|im_start|>`` as the BOS token and disables automatic
BOS/EOS injection in prompts.

Args:
model_name_or_path: HuggingFace model identifier or local path.
Expand All @@ -586,21 +589,23 @@ class Llama32(ModelMetadata):
def __init__(
self,
model_name_or_path: str,
tokenizer=None,
tokenizer: PreTrainedTokenizer | None = None,
rope_scaling_factor: float | None = None,
**kwargs,
) -> None:
config: PretrainedConfig = AutoConfig.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) if tokenizer is None else tokenizer

im_start_id = tokenizer.convert_tokens_to_ids("<|im_start|>")
super().__init__(
autoconfig=config,
instruction=DEFAULT_INSTRUCTION,
prompt_config=LLMPromptConfig.from_tokenizer(
name=model_name_or_path,
tokenizer=tokenizer,
template="user\n {instruction} {schema} \n assistant\n{prefill}",
bos_token="<|im_start|>",
bos_token_id=151644,
bos_token_id=im_start_id,
add_bos_token_to_prompt=False,
add_eos_token_to_prompt=False,
),
Expand Down Expand Up @@ -629,11 +634,11 @@ class Mistral(ModelMetadata):
def __init__(
self,
model_name_or_path: str,
tokenizer: AutoTokenizer | None = None,
tokenizer: PreTrainedTokenizer | None = None,
rope_scaling_factor: float | None = None,
**kwargs,
) -> None:
tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(model_name_or_path) if tokenizer is None else tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) if tokenizer is None else tokenizer
config: PretrainedConfig = AutoConfig.from_pretrained(model_name_or_path)
if rope_scaling_factor:
logger.warning(
Expand All @@ -646,6 +651,7 @@ def __init__(
instruction=DEFAULT_INSTRUCTION,
prompt_config=LLMPromptConfig.from_tokenizer(
name=model_name_or_path,
tokenizer=tokenizer,
template=template,
add_bos_token_to_prompt=True,
add_eos_token_to_prompt=True,
Expand All @@ -670,11 +676,11 @@ class Nemotron(ModelMetadata):
def __init__(
self,
model_name_or_path: str,
tokenizer=None,
tokenizer: PreTrainedTokenizer | None = None,
rope_scaling_factor: float | None = None,
**kwargs,
) -> None:
tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(model_name_or_path) if tokenizer is None else tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) if tokenizer is None else tokenizer
config: PretrainedConfig = AutoConfig.from_pretrained(model_name_or_path)

super().__init__(
Expand Down Expand Up @@ -707,7 +713,7 @@ class Qwen(ModelMetadata):
def __init__(
self,
model_name_or_path: str,
tokenizer=None,
tokenizer: PreTrainedTokenizer | None = None,
rope_scaling_factor: float | None = None,
**kwargs,
) -> None:
Expand Down Expand Up @@ -748,7 +754,7 @@ class SmolLM2(ModelMetadata):
def __init__(
self,
model_name_or_path: str,
tokenizer=None,
tokenizer: PreTrainedTokenizer | None = None,
rope_scaling_factor: float | None = None,
**kwargs,
) -> None:
Expand All @@ -759,6 +765,7 @@ def __init__(
f"Rope scaling factor {rope_scaling_factor} is not supported for SmolLM2 due to longer default context lengths. Ignoring."
)

im_start_id = tokenizer.convert_tokens_to_ids("<|im_start|>")
super().__init__(
autoconfig=config,
instruction=DEFAULT_INSTRUCTION,
Expand All @@ -768,7 +775,7 @@ def __init__(
add_eos_token_to_prompt=False,
tokenizer=tokenizer,
bos_token="<|im_start|>",
bos_token_id=151644,
bos_token_id=im_start_id,
name=model_name_or_path,
),
model_name_or_path=model_name_or_path,
Expand All @@ -781,9 +788,9 @@ def __init__(
class SmolLM3(ModelMetadata):
"""Metadata for HuggingFace SmolLM3 model family.

Uses ``<|im_start|>`` (id 128011) as the BOS token. RoPE scaling
is not supported. Any supplied ``rope_scaling_factor`` will be
ignored with a warning.
Uses ``<|im_start|>`` as the BOS token. RoPE scaling is not
supported. Any supplied ``rope_scaling_factor`` will be ignored
with a warning.

Args:
model_name_or_path: HuggingFace model identifier or local path.
Expand All @@ -795,19 +802,16 @@ class SmolLM3(ModelMetadata):
def __init__(
self,
model_name_or_path: str,
tokenizer=None,
tokenizer: PreTrainedTokenizer | None = None,
rope_scaling_factor: float | None = None,
**kwargs,
) -> None:
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) if tokenizer is None else tokenizer
config = AutoConfig.from_pretrained(model_name_or_path)

# we use the bos token here explicitly for support during group-by SFT.
# the groupby assumes there is a bos token at the start of the prompt.
bos_token = "<|im_start|>"
bos_token_id = 128011
# Group-by SFT assumes a BOS token at the start of the prompt.
im_start_id = tokenizer.convert_tokens_to_ids("<|im_start|>")

# SmolLM3 uses high theta values (1.5M-5M) so it's important to read from config
if rope_scaling_factor:
logger.warning(
f"Rope scaling factor {rope_scaling_factor} is not supported for SmolLM3 due to longer default context lengths. Ignoring."
Expand All @@ -822,8 +826,8 @@ def __init__(
add_eos_token_to_prompt=True,
tokenizer=tokenizer,
name=model_name_or_path,
bos_token=bos_token,
bos_token_id=bos_token_id,
bos_token="<|im_start|>",
bos_token_id=im_start_id,
),
model_name_or_path=model_name_or_path,
rope_scaling=None,
Expand All @@ -845,7 +849,7 @@ class TinyLlama(ModelMetadata):
def __init__(
self,
model_name_or_path: str,
tokenizer=None,
tokenizer: PreTrainedTokenizer | None = None,
rope_scaling_factor: float | None = None,
**kwargs,
) -> None:
Expand Down
8 changes: 5 additions & 3 deletions tests/llm/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ class RopeScalingScenario:
expected_add_bos=False,
expected_add_eos=False,
expected_bos_token="<|im_start|>",
expected_bos_token_id=151644,
expected_bos_token_id=1,  # this is the token_id that the mock_tokenizer injects, not what a real model would use
Copy link

Copilot AI Mar 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo in the inline comment: "mock_tockenizer" should be "mock_tokenizer" to match the fixture name and avoid confusion when reading the test expectations.

Suggested change
expected_bos_token_id=1, # this is the token_id that the mock_tockenizer injects, not what a real model would use
expected_bos_token_id=1, # this is the token_id that the mock_tokenizer injects, not what a real model would use

Copilot uses AI. Check for mistakes.
),
ModelInitScenario(
id="smollm2",
Expand All @@ -134,7 +134,7 @@ class RopeScalingScenario:
expected_add_bos=False,
expected_add_eos=False,
expected_bos_token="<|im_start|>",
expected_bos_token_id=151644,
expected_bos_token_id=1,
custom_max_position_embeddings=8192,
),
ModelInitScenario(
Expand All @@ -145,7 +145,7 @@ class RopeScalingScenario:
expected_add_bos=True,
expected_add_eos=True,
expected_bos_token="<|im_start|>",
expected_bos_token_id=128011,
expected_bos_token_id=1,
use_global_max_seq=True,
),
ModelInitScenario(
Expand Down Expand Up @@ -297,6 +297,8 @@ def mock_tokenizer():
tokenizer.bos_token_id = 1
tokenizer.eos_token = "</s>"
tokenizer.eos_token_id = 2
_token_ids = {"<|im_start|>": 1, "<|im_end|>": 2}
tokenizer.convert_tokens_to_ids = lambda tok: _token_ids.get(tok, 0)
return tokenizer


Expand Down
Loading