diff --git a/QEfficient/finetune/experimental/core/model.py b/QEfficient/finetune/experimental/core/model.py
index 0f087e665..1f758ba6f 100644
--- a/QEfficient/finetune/experimental/core/model.py
+++ b/QEfficient/finetune/experimental/core/model.py
@@ -1,8 +1,6 @@
 # -----------------------------------------------------------------------------
-#
 # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
 # SPDX-License-Identifier: BSD-3-Clause
-#
 # -----------------------------------------------------------------------------
 
 import warnings
@@ -11,7 +9,7 @@
 import torch.nn as nn
 import transformers
 
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, PretrainedConfig
 
 from QEfficient.finetune.experimental.core.component_registry import registry
 from QEfficient.finetune.experimental.core.logger import Logger
@@ -48,7 +46,10 @@ def load_model(self) -> nn.Module:
 
     def load_tokenizer(self) -> Any:
         """Override if the model exposes a tokenizer."""
-        warnings.warn(f"{type(self).__name__} does not provide a tokenizer.", category=UserWarning)
+        warnings.warn(
+            f"{type(self).__name__} does not provide a tokenizer.",
+            category=UserWarning,
+        )
         return None
 
     # Lazy accessors
@@ -82,7 +83,10 @@ def eval(self):
 
 @registry.model("hf")
 class HFModel(BaseModel):
-    """HuggingFace-backed model with optional quantization."""
+    """
+    HuggingFace-backed model loaded lazily via a configurable transformers
+    auto class and `from_pretrained` kwargs.
+    """
 
     def __init__(
         self,
@@ -105,26 +109,43 @@ def _resolve_auto_class(auto_class_name: str) -> Type:
         )
         return getattr(transformers, auto_class_name)
 
-    # def _build_quant_config(self) -> Optional[BitsAndBytesConfig]:
-    #     if not self.model_kwargs.get("load_in_4bit"):
-    #         return None
-    #     return BitsAndBytesConfig(
-    #         load_in_4bit=True,
-    #         bnb_4bit_quant_type=self.model_kwargs.get("bnb_4bit_quant_type", "nf4"),
-    #         bnb_4bit_compute_dtype=self.model_kwargs.get("bnb_4bit_compute_dtype", torch.float16),
-    #         bnb_4bit_use_double_quant=self.model_kwargs.get("bnb_4bit_use_double_quant", True),
-    #     )
-
     def configure_model_kwargs(self) -> Dict[str, Any]:
-        """Hook for subclasses to tweak HF `.from_pretrained` kwargs."""
+        """
+        Hook for subclasses to tweak HF `.from_pretrained` kwargs.
+
+        Follows the standard HuggingFace transformers patterns:
+        1. Pass a config object directly via `config=` (a `PretrainedConfig`,
+           e.g. built with `AutoConfig.from_pretrained`).
+        2. Pass individual config parameters as kwargs (e.g. `num_hidden_layers=2`).
+
+        Returns:
+            Dict[str, Any]: Kwargs ready to be forwarded to `from_pretrained()`.
+        """
         extra = dict(self.model_kwargs)
-        # extra["quantization_config"] = self._build_quant_config()
+
+        # Validate the config object when one is passed directly (HuggingFace pattern).
+        if "config" in extra:
+            config = extra["config"]
+            if not isinstance(config, PretrainedConfig):
+                raise TypeError(
+                    f"Expected a PretrainedConfig instance (e.g. from AutoConfig.from_pretrained), got {type(config)}. "
+                    "Pass a config object or use config parameters as kwargs."
+                )
+
         return extra
 
     def load_model(self) -> nn.Module:
-        logger.log_rank_zero(f"Loading HuggingFace model '{self.model_name}' via {self.auto_class.__name__}")
+        """
+        Load the HuggingFace model, applying any config overrides.
+
+        Supports the standard HuggingFace transformers patterns:
+        1. Individual config parameters passed directly as kwargs.
+        2. A prebuilt config object passed via `config=`.
+
+        Returns:
+            nn.Module: The loaded model.
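+
+        Example (illustrative only; the 2-layer override is meant for quick
+        experiments and mirrors the call pattern used in the new tests)::
+
+            # 1) Pass config parameters directly as kwargs.
+            model = ComponentFactory.create_model(
+                "hf", "meta-llama/Llama-3.2-1B", num_hidden_layers=2
+            )
+
+            # 2) Pass a prebuilt config object.
+            config = AutoConfig.from_pretrained("meta-llama/Llama-3.2-1B", num_hidden_layers=2)
+            model = ComponentFactory.create_model(
+                "hf", "meta-llama/Llama-3.2-1B", config=config
+            )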
+ """ + logger.log_rank_zero(f"Loading HuggingFace model '{self.model_name}' via {self.auto_class.__name__}") return self.auto_class.from_pretrained( self.model_name, **self.configure_model_kwargs(), diff --git a/QEfficient/finetune/experimental/tests/test_model.py b/QEfficient/finetune/experimental/tests/test_model.py index e83abf389..0098a93c7 100644 --- a/QEfficient/finetune/experimental/tests/test_model.py +++ b/QEfficient/finetune/experimental/tests/test_model.py @@ -134,3 +134,137 @@ def forward(self, x): assert hasattr(tok, "pad_token_id") assert m.model.loaded[0] == "hf-name" + + +# Config Loading Tests - Partial Loading of Meta Llama Model + + +def test_hfmodel_partial_loading_meta_llama_with_direct_config_params(monkeypatch): + """ + Test partial loading of meta-llama model using direct config parameters. + Loads meta-llama model with reduced layers (2 layers) for faster testing + using direct config parameters (HuggingFace standard pattern). + """ + + # Mock model that respects num_hidden_layers parameter + def create_partial_model(name, config=None, num_hidden_layers=None, **kwargs): + model_instance = nn.Module() + if config: + model_instance.config = config + n_layers = config.num_hidden_layers + elif num_hidden_layers: + model_instance.config = mock.Mock() + model_instance.config.num_hidden_layers = num_hidden_layers + model_instance.config.hidden_size = 4096 + n_layers = num_hidden_layers + else: + model_instance.config = mock.Mock() + model_instance.config.num_hidden_layers = 32 + model_instance.config.hidden_size = 4096 + n_layers = 32 + + model_instance.layers = nn.ModuleList( + [nn.Linear(model_instance.config.hidden_size, model_instance.config.hidden_size) for _ in range(n_layers)] + ) + return model_instance + + class MockAutoModelForCausalLM: + @classmethod + def from_pretrained(cls, name, config=None, num_hidden_layers=None, **kwargs): + return create_partial_model(name, config=config, num_hidden_layers=num_hidden_layers, **kwargs) + + monkeypatch.setattr( + "QEfficient.finetune.experimental.core.model.transformers.AutoModelForCausalLM", + MockAutoModelForCausalLM, + raising=False, + ) + monkeypatch.setattr( + "QEfficient.finetune.experimental.core.model.AutoTokenizer", + type("MockTokenizer", (), {"from_pretrained": lambda *args, **kwargs: mock.Mock(pad_token_id=0)}), + raising=False, + ) + monkeypatch.setattr( + "QEfficient.finetune.experimental.core.model.insert_pad_token", + lambda tok: None, + raising=False, + ) + + # Load partial meta-llama model with direct config parameter (2 layers for testing) + partial_model = ComponentFactory.create_model( + "hf", + "meta-llama/Llama-3.2-1B", + num_hidden_layers=2, # Load only 2 layers (partial loading) + ) + + # Verify partial model was loaded with reduced layers + assert partial_model.model.config.num_hidden_layers == 2 + assert len(partial_model.model.layers) == 2 + assert partial_model.model.config.hidden_size == 4096 + + +def test_hfmodel_partial_loading_meta_llama_for_fast_testing(monkeypatch): + """ + Test partial loading of meta-llama model for fast testing. 
+ """ + + # Mock model that respects num_hidden_layers parameter + def create_partial_model(name, config=None, num_hidden_layers=None, **kwargs): + model_instance = nn.Module() + if config: + model_instance.config = config + n_layers = config.num_hidden_layers + elif num_hidden_layers: + model_instance.config = mock.Mock() + model_instance.config.num_hidden_layers = num_hidden_layers + model_instance.config.hidden_size = 4096 + n_layers = num_hidden_layers + else: + model_instance.config = mock.Mock() + model_instance.config.num_hidden_layers = 32 + model_instance.config.hidden_size = 4096 + n_layers = 32 + + model_instance.layers = nn.ModuleList( + [nn.Linear(model_instance.config.hidden_size, model_instance.config.hidden_size) for _ in range(n_layers)] + ) + # Track parameter count (fewer layers = fewer parameters) + model_instance.param_count = sum(p.numel() for p in model_instance.layers.parameters()) + return model_instance + + class MockAutoModelForCausalLM: + @classmethod + def from_pretrained(cls, name, config=None, num_hidden_layers=None, **kwargs): + return create_partial_model(name, config=config, num_hidden_layers=num_hidden_layers, **kwargs) + + monkeypatch.setattr( + "QEfficient.finetune.experimental.core.model.transformers.AutoModelForCausalLM", + MockAutoModelForCausalLM, + raising=False, + ) + monkeypatch.setattr( + "QEfficient.finetune.experimental.core.model.AutoTokenizer", + type("MockTokenizer", (), {"from_pretrained": lambda *args, **kwargs: mock.Mock(pad_token_id=0)}), + raising=False, + ) + monkeypatch.setattr( + "QEfficient.finetune.experimental.core.model.insert_pad_token", + lambda tok: None, + raising=False, + ) + + # Load partial meta-llama model (2 layers) + test_model = ComponentFactory.create_model("hf", "meta-llama/Llama-3.2-1B", num_hidden_layers=2) + + # Verify partial model was loaded with reduced layers + assert test_model.model.config.num_hidden_layers == 2 + assert len(test_model.model.layers) == 2 + + # Verify model still works (can do forward pass) + test_input = torch.randn(1, 10, test_model.model.config.hidden_size) + output = test_model.model.layers[0](test_input) + assert output.shape == test_input.shape + + # Verify we can test model functionality with partial model + assert len(test_model.model.layers) == 2 + assert test_model.model.config.hidden_size == 4096 # Other config preserved + assert test_model.model.param_count > 0 # Has parameters