
Commit 81b4f98

SunMarc and MekkCyber authored
transformers serve quantization docs + some api fixes for bitsandbytes (#41253)
* doc
* fix api
* fix
* fix
* fix
* fix args
* minor doc fix
* fix
* style
* rm check for now
* fix
* style
* Update docs/source/en/serving.md
  Co-authored-by: Mohamed Mekkouri <[email protected]>
* add log and update value

---------

Co-authored-by: Mohamed Mekkouri <[email protected]>
1 parent 2a3f66d · commit 81b4f98

File tree: 2 files changed, +39 −31 lines

- docs/source/en/serving.md
- src/transformers/cli/serve.py


docs/source/en/serving.md

Lines changed: 24 additions & 2 deletions
@@ -383,6 +383,30 @@ transformers serve \
   --attn_implementation "sdpa"
 ```
 
+### Quantization
+
+transformers serve is compatible with all [quantization methods](https://huggingface.co/docs/transformers/main/quantization/overview) supported in transformers. Quantization can significantly reduce memory usage and improve inference speed, with two main workflows: pre-quantized models and on-the-fly quantization.
+
+#### Pre-quantized models
+
+For models that are already quantized (e.g., GPTQ, AWQ, bitsandbytes), simply choose a quantized model name for serving.
+Make sure to install the required libraries listed in the quantization documentation.
+
+> [!TIP]
+> Pre-quantized models generally provide the best balance of performance and accuracy.
+
+#### On-the-fly quantization
+
+To quantize a model at runtime, pass the `--quantization` flag on the CLI. Note that not all quantization methods support on-the-fly conversion; the full list of supported methods is available in the quantization [overview](https://huggingface.co/docs/transformers/main/quantization/overview).
+
+Currently, transformers serve only supports the following methods: `bnb-4bit` and `bnb-8bit`.
+
+For example, to enable 4-bit quantization with bitsandbytes, pass `--quantization bnb-4bit`:
+
+```sh
+transformers serve --quantization bnb-4bit
+```
+
 ### Performance tips
 
 - Use an efficient attention backend when available:
@@ -397,6 +421,4 @@ transformers serve \
 
 - `--dtype {bfloat16|float16}` typically improve throughput and memory use vs. `float32`
 
-- `--load_in_4bit`/`--load_in_8bit` can reduce memory footprint for LoRA setups
-
 - `--force-model <repo_id>` avoids per-request model hints and helps produce stable, repeatable runs
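To try the new flag end to end, the sketch below sends a request to a locally running `transformers serve --quantization bnb-4bit` instance. It assumes the server's OpenAI-compatible `/v1` routes on the default port 8000; the model id and prompt are placeholders, not values from this commit.

```py
# Minimal client sketch: assumes a local `transformers serve --quantization bnb-4bit`
# instance on the default port exposing the OpenAI-compatible /v1 routes.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")

response = client.chat.completions.create(
    model="Qwen/Qwen2.5-0.5B-Instruct",  # placeholder model id
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=32,
)
print(response.choices[0].message.content)
```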

src/transformers/cli/serve.py

Lines changed: 15 additions & 29 deletions
@@ -377,14 +377,10 @@ def __init__(
                 help="Which attention implementation to use; you can run --attn_implementation=flash_attention_2, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`."
             ),
         ] = None,
-        load_in_8bit: Annotated[
-            bool, typer.Option(help="Whether to use 8 bit precision for the base model - works only with LoRA.")
-        ] = False,
-        load_in_4bit: Annotated[
-            bool, typer.Option(help="Whether to use 4 bit precision for the base model - works only with LoRA.")
-        ] = False,
-        bnb_4bit_quant_type: Annotated[str, typer.Option(help="Quantization type.")] = "nf4",
-        use_bnb_nested_quant: Annotated[bool, typer.Option(help="Whether to use nested quantization.")] = False,
+        quantization: Annotated[
+            Optional[str],
+            typer.Option(help="Which quantization method to use. choices: 'bnb-4bit', 'bnb-8bit'"),
+        ] = None,
         host: Annotated[str, typer.Option(help="Interface the server will listen to.")] = "localhost",
         port: Annotated[int, typer.Option(help="Port the server will listen to.")] = 8000,
         model_timeout: Annotated[
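For readers unfamiliar with the `Annotated`/`typer.Option` pattern used by the new parameter above, here is a self-contained toy sketch of how such a parameter surfaces as a `--quantization` CLI flag; the `serve` command below is illustrative only, not the real ServeCommand wiring.

```py
# Toy sketch of the Annotated/typer.Option pattern; not the real ServeCommand.
from typing import Annotated, Optional

import typer

app = typer.Typer()


@app.command()
def serve(
    quantization: Annotated[
        Optional[str],
        typer.Option(help="Which quantization method to use. choices: 'bnb-4bit', 'bnb-8bit'"),
    ] = None,
):
    # Typer derives the --quantization flag name from the parameter name.
    typer.echo(f"quantization={quantization}")


if __name__ == "__main__":
    app()
```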
@@ -424,10 +420,7 @@ def __init__(
         self.dtype = dtype
         self.trust_remote_code = trust_remote_code
         self.attn_implementation = attn_implementation
-        self.load_in_8bit = load_in_8bit
-        self.load_in_4bit = load_in_4bit
-        self.bnb_4bit_quant_type = bnb_4bit_quant_type
-        self.use_bnb_nested_quant = use_bnb_nested_quant
+        self.quantization = quantization
         self.host = host
         self.port = port
         self.model_timeout = model_timeout
@@ -1688,22 +1681,20 @@ def get_quantization_config(self) -> Optional["BitsAndBytesConfig"]:
         Returns:
             `Optional[BitsAndBytesConfig]`: The quantization config.
         """
-        if self.load_in_4bit:
+        if self.quantization == "bnb-4bit":
             quantization_config = BitsAndBytesConfig(
                 load_in_4bit=True,
-                # For consistency with model weights, we use the same value as `dtype`
-                bnb_4bit_compute_dtype=self.dtype,
-                bnb_4bit_quant_type=self.bnb_4bit_quant_type,
-                bnb_4bit_use_double_quant=self.use_bnb_nested_quant,
-                bnb_4bit_quant_storage=self.dtype,
-            )
-        elif self.load_in_8bit:
-            quantization_config = BitsAndBytesConfig(
-                load_in_8bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_use_double_quant=True,
             )
+        elif self.quantization == "bnb-8bit":
+            quantization_config = BitsAndBytesConfig(load_in_8bit=True)
         else:
             quantization_config = None
 
+        if quantization_config is not None:
+            logger.info(f"Quantization applied with the following config: {quantization_config}")
+
         return quantization_config
 
     def process_model_name(self, model_id: str) -> str:
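To read the new mapping outside the diff, here is a standalone sketch of the logic `get_quantization_config` implements after this change, written as a module-level function with its own logger for illustration; the real code is a method on the serve command.

```py
# Standalone sketch of the --quantization -> BitsAndBytesConfig mapping above.
import logging
from typing import Optional

from transformers import BitsAndBytesConfig

logger = logging.getLogger(__name__)


def get_quantization_config(quantization: Optional[str]) -> Optional[BitsAndBytesConfig]:
    if quantization == "bnb-4bit":
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )
    elif quantization == "bnb-8bit":
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
    else:
        quantization_config = None

    if quantization_config is not None:
        logger.info(f"Quantization applied with the following config: {quantization_config}")

    return quantization_config
```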
@@ -1750,27 +1741,22 @@ def _load_model_and_data_processor(self, model_id_and_revision: str):
             revision=revision,
             trust_remote_code=self.trust_remote_code,
         )
-
         dtype = self.dtype if self.dtype in ["auto", None] else getattr(torch, self.dtype)
         quantization_config = self.get_quantization_config()
 
         model_kwargs = {
             "revision": revision,
             "attn_implementation": self.attn_implementation,
             "dtype": dtype,
-            "device_map": "auto",
+            "device_map": self.device,
             "trust_remote_code": self.trust_remote_code,
+            "quantization_config": quantization_config,
         }
-        if quantization_config is not None:
-            model_kwargs["quantization_config"] = quantization_config
 
         config = AutoConfig.from_pretrained(model_id, **model_kwargs)
         architecture = getattr(transformers, config.architectures[0])
         model = architecture.from_pretrained(model_id, **model_kwargs)
 
-        if getattr(model, "hf_device_map", None) is None:
-            model = model.to(self.device)
-
         has_default_max_length = (
             model.generation_config.max_new_tokens is None and model.generation_config.max_length == 20
         )
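As a companion to the loading change above, the sketch below loads a model the way the patched code now does, always passing `quantization_config` (possibly `None`) to `from_pretrained` instead of adding it conditionally. `AutoModelForCausalLM`, the placeholder repo id, and the hard-coded device stand in for the dynamic `getattr(transformers, config.architectures[0])` lookup and `self.device`; the `dtype` keyword follows the diff's model_kwargs.

```py
# Sketch of the patched loading path: quantization_config is always passed through,
# and from_pretrained accepts quantization_config=None when no --quantization is set.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)  # or None when --quantization is not given

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-0.5B-Instruct",  # placeholder repo id
    dtype=torch.bfloat16,
    device_map="cuda",  # the serve command passes its own self.device here
    quantization_config=quantization_config,
)
```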
