57 changes: 39 additions & 18 deletions QEfficient/finetune/experimental/core/model.py
@@ -1,8 +1,6 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

import warnings
@@ -11,7 +9,7 @@

import torch.nn as nn
import transformers
from transformers import AutoTokenizer
from transformers import AutoConfig, AutoTokenizer

from QEfficient.finetune.experimental.core.component_registry import registry
from QEfficient.finetune.experimental.core.logger import Logger
@@ -48,7 +46,10 @@ def load_model(self) -> nn.Module:

def load_tokenizer(self) -> Any:
"""Override if the model exposes a tokenizer."""
warnings.warn(f"{type(self).__name__} does not provide a tokenizer.", category=UserWarning)
warnings.warn(
f"{type(self).__name__} does not provide a tokenizer.",
category=UserWarning,
)
return None

# Lazy accessors
@@ -82,7 +83,10 @@ def eval(self):

@registry.model("hf")
class HFModel(BaseModel):
"""HuggingFace-backed model with optional quantization."""
"""
HuggingFace-backed model

"""

def __init__(
self,
@@ -105,26 +109,43 @@ def _resolve_auto_class(auto_class_name: str) -> Type:
)
return getattr(transformers, auto_class_name)

# def _build_quant_config(self) -> Optional[BitsAndBytesConfig]:
# if not self.model_kwargs.get("load_in_4bit"):
# return None
# return BitsAndBytesConfig(
# load_in_4bit=True,
# bnb_4bit_quant_type=self.model_kwargs.get("bnb_4bit_quant_type", "nf4"),
# bnb_4bit_compute_dtype=self.model_kwargs.get("bnb_4bit_compute_dtype", torch.float16),
# bnb_4bit_use_double_quant=self.model_kwargs.get("bnb_4bit_use_double_quant", True),
# )

def configure_model_kwargs(self) -> Dict[str, Any]:
"""Hook for subclasses to tweak HF `.from_pretrained` kwargs."""
"""
Hook for subclasses to tweak HF `.from_pretrained` kwargs.

This method follows HuggingFace transformers patterns:
1. Supports passing `config` object directly (AutoConfig instance)
2. Supports passing config parameters directly as kwargs (e.g., num_hidden_layers=2)

Returns:
Dict[str, Any]: Cleaned kwargs ready for `from_pretrained()`
"""
extra = dict(self.model_kwargs)
# extra["quantization_config"] = self._build_quant_config()

# Handle the `config` parameter (HuggingFace pattern: pass a config object directly)
if "config" in extra:
    config = extra["config"]
    # AutoConfig.from_pretrained returns a PretrainedConfig subclass, so
    # validate against PretrainedConfig rather than AutoConfig itself.
    if not isinstance(config, transformers.PretrainedConfig):
        raise TypeError(
            f"Expected a PretrainedConfig instance, got {type(config)}. "
            "Pass a config object (e.g. from AutoConfig.from_pretrained) "
            "or supply config parameters as kwargs."
        )

return extra

def load_model(self) -> nn.Module:
logger.log_rank_zero(f"Loading HuggingFace model '{self.model_name}' via {self.auto_class.__name__}")
"""
Load HuggingFace model with config support.

Supports loading models with modified configurations following HuggingFace transformers patterns:
1. Direct config parameters (HuggingFace standard)
2. Config object (HuggingFace standard)

Returns:
nn.Module: The loaded model.
"""
logger.log_rank_zero(f"Loading HuggingFace model '{self.model_name}' via {self.auto_class.__name__}")
return self.auto_class.from_pretrained(
self.model_name,
**self.configure_model_kwargs(),
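For context, here is a minimal sketch of how the two configuration patterns described above look from the caller's side. The `ComponentFactory.create_model("hf", ...)` call mirrors the tests added below (its import is the one used in that test module, which this diff does not show), the config-object variant assumes a config built with `AutoConfig.from_pretrained`, and the gated meta-llama checkpoint needs authenticated Hub access, so this is illustrative rather than something run in CI.

from transformers import AutoConfig

# ComponentFactory is imported as in the test module below (path not shown in this diff).

# Pattern 1: config parameters passed directly as kwargs.
model_a = ComponentFactory.create_model(
    "hf", "meta-llama/Llama-3.2-1B", num_hidden_layers=2
)

# Pattern 2: a config object passed explicitly.
cfg = AutoConfig.from_pretrained("meta-llama/Llama-3.2-1B", num_hidden_layers=2)
model_b = ComponentFactory.create_model("hf", "meta-llama/Llama-3.2-1B", config=cfg)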
134 changes: 134 additions & 0 deletions QEfficient/finetune/experimental/tests/test_model.py
@@ -134,3 +134,137 @@ def forward(self, x):

assert hasattr(tok, "pad_token_id")
assert m.model.loaded[0] == "hf-name"


# Config Loading Tests - Partial Loading of Meta Llama Model


def test_hfmodel_partial_loading_meta_llama_with_direct_config_params(monkeypatch):
"""
Test partial loading of meta-llama model using direct config parameters.
Loads meta-llama model with reduced layers (2 layers) for faster testing
using direct config parameters (HuggingFace standard pattern).
"""

# Mock model that respects num_hidden_layers parameter
def create_partial_model(name, config=None, num_hidden_layers=None, **kwargs):
model_instance = nn.Module()
if config:
model_instance.config = config
n_layers = config.num_hidden_layers
elif num_hidden_layers:
model_instance.config = mock.Mock()
model_instance.config.num_hidden_layers = num_hidden_layers
model_instance.config.hidden_size = 4096
n_layers = num_hidden_layers
else:
model_instance.config = mock.Mock()
model_instance.config.num_hidden_layers = 32
model_instance.config.hidden_size = 4096
n_layers = 32

model_instance.layers = nn.ModuleList(
[nn.Linear(model_instance.config.hidden_size, model_instance.config.hidden_size) for _ in range(n_layers)]
)
return model_instance

class MockAutoModelForCausalLM:
@classmethod
def from_pretrained(cls, name, config=None, num_hidden_layers=None, **kwargs):
return create_partial_model(name, config=config, num_hidden_layers=num_hidden_layers, **kwargs)

monkeypatch.setattr(
"QEfficient.finetune.experimental.core.model.transformers.AutoModelForCausalLM",
MockAutoModelForCausalLM,
raising=False,
)
monkeypatch.setattr(
"QEfficient.finetune.experimental.core.model.AutoTokenizer",
type("MockTokenizer", (), {"from_pretrained": lambda *args, **kwargs: mock.Mock(pad_token_id=0)}),
raising=False,
)
monkeypatch.setattr(
"QEfficient.finetune.experimental.core.model.insert_pad_token",
lambda tok: None,
raising=False,
)

# Load partial meta-llama model with direct config parameter (2 layers for testing)
partial_model = ComponentFactory.create_model(
"hf",
"meta-llama/Llama-3.2-1B",
num_hidden_layers=2, # Load only 2 layers (partial loading)
)

# Verify partial model was loaded with reduced layers
assert partial_model.model.config.num_hidden_layers == 2
assert len(partial_model.model.layers) == 2
assert partial_model.model.config.hidden_size == 4096


def test_hfmodel_partial_loading_meta_llama_for_fast_testing(monkeypatch):
"""
Test partial loading of meta-llama model for fast testing.
"""

# Mock model that respects num_hidden_layers parameter
def create_partial_model(name, config=None, num_hidden_layers=None, **kwargs):
model_instance = nn.Module()
if config:
model_instance.config = config
n_layers = config.num_hidden_layers
elif num_hidden_layers:
model_instance.config = mock.Mock()
model_instance.config.num_hidden_layers = num_hidden_layers
model_instance.config.hidden_size = 4096
n_layers = num_hidden_layers
else:
model_instance.config = mock.Mock()
model_instance.config.num_hidden_layers = 32
model_instance.config.hidden_size = 4096
n_layers = 32

model_instance.layers = nn.ModuleList(
[nn.Linear(model_instance.config.hidden_size, model_instance.config.hidden_size) for _ in range(n_layers)]
)
# Track parameter count (fewer layers = fewer parameters)
model_instance.param_count = sum(p.numel() for p in model_instance.layers.parameters())
return model_instance

class MockAutoModelForCausalLM:
@classmethod
def from_pretrained(cls, name, config=None, num_hidden_layers=None, **kwargs):
return create_partial_model(name, config=config, num_hidden_layers=num_hidden_layers, **kwargs)

monkeypatch.setattr(
"QEfficient.finetune.experimental.core.model.transformers.AutoModelForCausalLM",
MockAutoModelForCausalLM,
raising=False,
)
monkeypatch.setattr(
"QEfficient.finetune.experimental.core.model.AutoTokenizer",
type("MockTokenizer", (), {"from_pretrained": lambda *args, **kwargs: mock.Mock(pad_token_id=0)}),
raising=False,
)
monkeypatch.setattr(
"QEfficient.finetune.experimental.core.model.insert_pad_token",
lambda tok: None,
raising=False,
)

# Load partial meta-llama model (2 layers)
test_model = ComponentFactory.create_model("hf", "meta-llama/Llama-3.2-1B", num_hidden_layers=2)

# Verify partial model was loaded with reduced layers
assert test_model.model.config.num_hidden_layers == 2
assert len(test_model.model.layers) == 2

# Verify model still works (can do forward pass)
test_input = torch.randn(1, 10, test_model.model.config.hidden_size)
output = test_model.model.layers[0](test_input)
assert output.shape == test_input.shape

# Other config values are preserved and the partial model still has parameters
assert test_model.model.config.hidden_size == 4096
assert test_model.model.param_count > 0
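
The tests above mock `from_pretrained`, so no weights are ever downloaded. As a reference point, a reduced-layer Llama can also be sanity-checked fully offline by building the model from a small config instead of a checkpoint; the sizes below are arbitrary small values chosen for illustration, not the real Llama-3.2-1B dimensions, and this assumes an installed transformers version with Llama support.

import torch
from transformers import AutoModelForCausalLM, LlamaConfig

# Tiny, randomly initialized 2-layer Llama; no checkpoint download needed.
cfg = LlamaConfig(
    num_hidden_layers=2,
    hidden_size=64,
    intermediate_size=128,
    num_attention_heads=4,
    num_key_value_heads=4,
    vocab_size=128,
)
model = AutoModelForCausalLM.from_config(cfg)

# A forward pass on random token ids confirms the truncated stack is wired correctly.
input_ids = torch.randint(0, cfg.vocab_size, (1, 10))
out = model(input_ids)
assert out.logits.shape == (1, 10, cfg.vocab_size)
assert model.config.num_hidden_layers == 2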