from typing import Optional, Dict, Any, Union, Tuple
import torch
-from transformers import PreTrainedModel, AutoTokenizer, AutoConfig
+from transformers import PreTrainedModel, AutoTokenizer, AutoConfig, BitsAndBytesConfig
from ..quant.gguf import GGUFQuantizer, SUPPORTED_GGUF_BITS, SUPPORTED_GGUF_TYPES
from ..utils.logger import logger
+import psutil
+import math
+
+def get_gpu_memory():
+    """Get available GPU memory in GB."""
+    if torch.cuda.is_available():
+        gpu_mem = []
+        for i in range(torch.cuda.device_count()):
+            total = torch.cuda.get_device_properties(i).total_memory / (1024 ** 3)  # Convert to GB
+            allocated = torch.cuda.memory_allocated(i) / (1024 ** 3)
+            gpu_mem.append(total - allocated)
+        return gpu_mem
+    return []
+
+def get_system_memory():
+    """Get available system memory in GB."""
+    return psutil.virtual_memory().available / (1024 ** 3)
+
+def estimate_model_size(model_name: Union[str, PreTrainedModel]) -> float:
+    """Estimate model size in GB."""
+    try:
+        if isinstance(model_name, PreTrainedModel):
+            params = sum(p.numel() for p in model_name.parameters())
+            return (params * 2) / (1024 ** 3)  # Assuming FP16
+        else:
+            config = AutoConfig.from_pretrained(model_name)
+            if hasattr(config, 'num_parameters'):
+                return (config.num_parameters * 2) / (1024 ** 3)  # Assuming FP16
+            elif hasattr(config, 'n_params'):
+                return (config.n_params * 2) / (1024 ** 3)  # Assuming FP16
+            # Estimate based on common architectures
+            elif hasattr(config, 'hidden_size') and hasattr(config, 'num_hidden_layers'):
+                # More accurate estimation for transformer models
+                hidden_size = config.hidden_size
+                num_layers = config.num_hidden_layers
+                vocab_size = config.vocab_size if hasattr(config, 'vocab_size') else 32000
+
+                # Calculate main components
+                attention_params = 4 * num_layers * hidden_size * hidden_size  # Q,K,V,O matrices
+                ffn_params = 8 * num_layers * hidden_size * hidden_size  # FFN layers
+                embedding_params = vocab_size * hidden_size  # Input embeddings
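+                # Rough worked example (illustrative): a LLaMA-7B-style config with
+                # hidden_size=4096, num_hidden_layers=32, vocab_size=32000 gives
+                # ~2.1B attention + ~4.3B FFN + ~0.13B embedding parameters, i.e.
+                # roughly 6.6B params and about 12 GB at 2 bytes per parameter (FP16).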
+
+                total_params = attention_params + ffn_params + embedding_params
+                return (total_params * 2) / (1024 ** 3)  # Assuming FP16
+
+            # If no size info available, estimate based on model name
+            if "llama" in model_name.lower():
+                if "7b" in model_name.lower():
+                    return 13.0
+                elif "13b" in model_name.lower():
+                    return 24.0
+                elif "70b" in model_name.lower():
+                    return 130.0
+                elif "3b" in model_name.lower():
+                    return 6.0
+            return 7.0  # Default assumption
+    except Exception as e:
+        logger.log_warning(f"Error estimating model size: {e}. Using default size.")
+        return 7.0  # Default assumption

class QuantLLM:
    """High-level API for GGUF model quantization."""
@@ -80,11 +139,11 @@ def get_recommended_quant_type(
        if model_size_gb <= 2:
            bits, qtype = (5, "Q5_1") if priority == "quality" else (4, "Q4_K_M")
        elif model_size_gb <= 7:
-            bits, qtype = (4, "Q4_K_M") if priority != "speed" else (4, "Q4_K_S")
+            bits, qtype = (5, "Q5_1") if priority == "quality" else (4, "Q4_K_M")
        elif model_size_gb <= 13:
-            bits, qtype = (3, "Q3_K_M") if priority != "speed" else (3, "Q3_K_S")
+            bits, qtype = (4, "Q4_K_M") if priority != "speed" else (4, "Q4_K_S")
        else:
-            bits, qtype = (2, "Q2_K")
+            bits, qtype = (3, "Q3_K_M")

        return bits, qtype

@@ -108,10 +167,11 @@ def quantize_from_pretrained(
        offload_state_dict: bool = False,
        torch_dtype: Optional[torch.dtype] = torch.float16,
        auto_device: bool = True,
-        optimize_for: str = "balanced"
+        optimize_for: str = "balanced",
+        cpu_offload: bool = False
    ) -> PreTrainedModel:
        """
-        Quantize a model using GGUF format with BitsAndBytes and Accelerate for efficient loading.
+        Quantize a model using GGUF format with optimized resource handling.

        Args:
            model_name: Model identifier or instance
@@ -133,6 +193,7 @@ def quantize_from_pretrained(
            torch_dtype: Default torch dtype
            auto_device: Automatically determine optimal device
            optimize_for: Optimization priority ("speed", "quality", or "balanced")
+            cpu_offload: Whether to use CPU offloading

        Returns:
            Quantized model
@@ -145,42 +206,56 @@ def quantize_from_pretrained(
        if quant_type and quant_type not in SUPPORTED_GGUF_TYPES.get(bits, {}):
            raise ValueError(f"Unsupported quant_type: {quant_type} for {bits} bits")

-        # Auto-determine device if requested
-        if auto_device and device is None:
-            if torch.cuda.is_available():
-                # Check available GPU memory
-                gpu_mem = torch.cuda.get_device_properties(0).total_memory
-                model_size = 0
-                if isinstance(model_name, PreTrainedModel):
-                    model_size = sum(p.numel() * p.element_size() for p in model_name.parameters())
-
-                # If model is too large for GPU, use CPU offloading
-                if model_size > gpu_mem * 0.7:  # Leave 30% margin
-                    logger.log_info("Model too large for GPU memory. Using CPU offloading.")
+        # Estimate model size and available resources
+        model_size_gb = estimate_model_size(model_name)
+        gpu_mem = get_gpu_memory()
+        system_mem = get_system_memory()
+
+        logger.log_info(f"Estimated model size: {model_size_gb:.2f} GB")
+        logger.log_info(f"Available GPU memory: {gpu_mem}")
+        logger.log_info(f"Available system memory: {system_mem:.2f} GB")
+
+        # Auto-configure resources
+        if auto_device:
+            if torch.cuda.is_available() and gpu_mem:
+                max_gpu_mem = max(gpu_mem)
+                if model_size_gb * 1.5 > max_gpu_mem:  # Need 1.5x for safe loading
+                    logger.log_info("Insufficient GPU memory. Using CPU offloading.")
                    device = "cpu"
+                    cpu_offload = True
                    device_map = "cpu"
                    max_memory = None
                else:
                    device = "cuda"
+                    # Calculate memory distribution
+                    if device_map == "auto":
+                        max_memory = {
+                            i: f"{int(mem * 0.8)}GB"  # Use 80% of available memory
+                            for i, mem in enumerate(gpu_mem)
+                        }
+                        max_memory["cpu"] = f"{int(system_mem * 0.5)}GB"  # Use 50% of system RAM
            else:
                device = "cpu"
+                cpu_offload = True
                device_map = "cpu"
                max_memory = None
            logger.log_info(f"Auto-selected device: {device}")
+
+        # Configure BitsAndBytes for 4-bit quantization
+        if load_in_4bit:
+            compute_dtype = bnb_4bit_compute_dtype or torch.float16
+            bnb_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type=bnb_4bit_quant_type,
+                bnb_4bit_compute_dtype=compute_dtype,
+                bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
+                llm_int8_enable_fp32_cpu_offload=cpu_offload
+            )
+        else:
+            bnb_config = None

-        # If no quant_type specified, use recommended type based on optimization priority
+        # If no quant_type specified, use recommended type
        if not quant_type:
-            if isinstance(model_name, PreTrainedModel):
-                model_size_gb = sum(p.numel() * p.element_size() for p in model_name.parameters()) / (1024 ** 3)
-            else:
-                # Estimate model size based on common architectures
-                config = AutoConfig.from_pretrained(model_name)
-                params = config.n_params if hasattr(config, 'n_params') else None
-                if params:
-                    model_size_gb = (params * 2) / (1024 ** 3)  # Assuming FP16
-                else:
-                    model_size_gb = 7  # Default assumption
-
            bits, quant_type = QuantLLM.get_recommended_quant_type(
                model_size_gb=model_size_gb,
                priority=optimize_for
@@ -194,17 +269,14 @@ def quantize_from_pretrained(
                quant_type=quant_type,
                use_packed=use_packed,
                device=device,
-                load_in_8bit=load_in_8bit,
-                load_in_4bit=load_in_4bit,
-                bnb_4bit_quant_type=bnb_4bit_quant_type,
-                bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
-                bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
+                quantization_config=bnb_config,
                use_gradient_checkpointing=use_gradient_checkpointing,
                device_map=device_map,
                max_memory=max_memory,
                offload_folder=offload_folder,
                offload_state_dict=offload_state_dict,
-                torch_dtype=torch_dtype
+                torch_dtype=torch_dtype,
+                cpu_offload=cpu_offload
            )

        return quantizer.model
@@ -223,32 +295,61 @@ def save_quantized_model(
        save_tokenizer: bool = True,
        quant_config: Optional[Dict[str, Any]] = None
    ):
-        """
-        Save a quantized model in GGUF format.
-
-        Args:
-            model: Quantized model to save
-            output_path: Path to save the model
-            save_tokenizer: Whether to save the tokenizer
-            quant_config: Optional quantization configuration
-        """
+        """Save a quantized model in GGUF format."""
        try:
-            logger.log_info(f"Converting model to GGUF format: {output_path}")
+            logger.log_info("\n" + "=" * 60)
+            logger.log_info("Starting GGUF Export Process")
+            logger.log_info("=" * 60)
+
+            # Log model details
+            total_params = sum(p.numel() for p in model.parameters())
+            model_size_gb = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024 ** 3)
+
+            logger.log_info(f"\nModel Information:")
+            logger.log_info(f"Architecture: {model.config.model_type}")
+            logger.log_info(f"Total Parameters: {total_params:,}")
+            logger.log_info(f"Model Size: {model_size_gb:.2f} GB")
+
+            # Get quantization info
+            if hasattr(model.config, 'quantization_config'):
+                config_dict = model.config.quantization_config
+                if isinstance(config_dict, BitsAndBytesConfig):
+                    # Handle BitsAndBytesConfig
+                    bits = 4 if config_dict.load_in_4bit else (8 if config_dict.load_in_8bit else 16)
+                    quant_config = {
+                        'bits': bits,
+                        'group_size': 128,  # Default group size
+                        'quant_type': f"Q{bits}_K_M" if bits <= 8 else "F16"
+                    }
+                    logger.log_info(f"\nQuantization Configuration:")
+                    logger.log_info(f"Bits: {bits}")
+                    logger.log_info(f"Quantization Type: {quant_config['quant_type']}")
+                    if config_dict.load_in_4bit:
+                        logger.log_info(f"4-bit Type: {config_dict.bnb_4bit_quant_type}")
+                        logger.log_info(f"Compute dtype: {config_dict.bnb_4bit_compute_dtype}")
+                else:
+                    quant_config = config_dict

-            # Get quantization config from model if not provided
-            if not quant_config and hasattr(model.config, 'quantization_config'):
-                quant_config = model.config.quantization_config
+            if not quant_config:
+                logger.log_info("\nUsing default 4-bit quantization settings")
+                quant_config = {
+                    'bits': 4,
+                    'group_size': 128,
+                    'quant_type': "Q4_K_M"
+                }

-            # Create quantizer with existing or default config
+            # Create quantizer with config
+            logger.log_info("\nInitializing GGUF quantizer...")
            quantizer = GGUFQuantizer(
                model_name=model,
-                bits=quant_config.get('bits', 4) if quant_config else 4,
-                group_size=quant_config.get('group_size', 128) if quant_config else 128,
-                quant_type=quant_config.get('quant_type', None) if quant_config else None,
-                use_packed=quant_config.get('use_packed', True) if quant_config else True
+                bits=quant_config['bits'],
+                group_size=quant_config.get('group_size', 128),
+                quant_type=quant_config.get('quant_type'),
+                use_packed=quant_config.get('use_packed', True)
            )

            # Convert to GGUF
+            logger.log_info("\nConverting model to GGUF format...")
            quantizer.convert_to_gguf(output_path)
            logger.log_info("GGUF conversion completed successfully")

@@ -260,12 +361,14 @@ def save_quantized_model(
                        model.config._name_or_path,
                        trust_remote_code=True
                    )
-                    tokenizer.save_pretrained(output_path)
-                    logger.log_info("Tokenizer saved successfully")
+                    tokenizer_path = output_path.rsplit('.', 1)[0] + "_tokenizer"
+                    tokenizer.save_pretrained(tokenizer_path)
+                    logger.log_info(f"Tokenizer saved to: {tokenizer_path}")
                except Exception as e:
                    logger.log_warning(f"Failed to save tokenizer: {e}")

-            logger.log_info("Model saved successfully")
+            logger.log_info("\nModel export completed successfully!")
+            logger.log_info("=" * 60)

        except Exception as e:
            logger.log_error(f"Failed to save model: {str(e)}")
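
A minimal usage sketch of the high-level API changed above, assuming the package exposes QuantLLM at the top level; the import path and model id are placeholders, and only keyword arguments that appear in this diff are used:

    from quantllm import QuantLLM  # import path is an assumption

    # Quantize with automatic device/memory configuration; when quant_type is omitted,
    # get_recommended_quant_type picks one from the estimated model size.
    model = QuantLLM.quantize_from_pretrained(
        model_name="facebook/opt-125m",  # placeholder model id
        bits=4,
        load_in_4bit=True,
        auto_device=True,
        optimize_for="balanced",
    )

    # Export to GGUF; the tokenizer is written to "<output stem>_tokenizer"
    QuantLLM.save_quantized_model(model=model, output_path="opt-125m-q4_k_m.gguf")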