From 1523880520cce63247bc0bb0f3ce9414aaef7131 Mon Sep 17 00:00:00 2001
From: Casper
Date: Thu, 12 Sep 2024 12:39:59 +0200
Subject: [PATCH] device_map defaults to auto (#607)

---
 README.md                  |  2 +-
 awq/models/auto.py         |  2 +-
 awq/models/base.py         |  2 +-
 docs/examples.md           | 15 ++++++++-------
 examples/cli.py            |  2 +-
 examples/quantize.py       |  2 +-
 scripts/runpod_quantize.py |  1 +
 7 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 56d02a0f..c3b2757c 100644
--- a/README.md
+++ b/README.md
@@ -109,7 +109,7 @@ quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version":
 
 # Load model
 model = AutoAWQForCausalLM.from_pretrained(
-    model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
+    model_path, low_cpu_mem_usage=True, use_cache=False
 )
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
diff --git a/awq/models/auto.py b/awq/models/auto.py
index 3a6416f1..af2580d5 100644
--- a/awq/models/auto.py
+++ b/awq/models/auto.py
@@ -60,7 +60,7 @@ def from_pretrained(
         model_path,
         trust_remote_code=True,
         safetensors=True,
-        device_map=None,
+        device_map="auto",
         download_kwargs=None,
         **model_init_kwargs,
     ) -> BaseAWQForCausalLM:
diff --git a/awq/models/base.py b/awq/models/base.py
index 475e48bb..a76bb293 100644
--- a/awq/models/base.py
+++ b/awq/models/base.py
@@ -347,7 +347,7 @@ def from_pretrained(
             Doc(
                 "A device map that will be passed onto the model loading method from transformers."
             ),
-        ] = None,
+        ] = "auto",
         download_kwargs: Annotated[
             Dict,
             Doc("Used for configure download model"),
diff --git a/docs/examples.md b/docs/examples.md
index 2603922a..86983dd3 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -21,7 +21,7 @@ quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version":
 
 # Load model
 model = AutoAWQForCausalLM.from_pretrained(
-    model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
+    model_path, low_cpu_mem_usage=True, use_cache=False
 )
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
@@ -50,7 +50,9 @@ quant_path = 'vicuna-7b-v1.5-awq'
 quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
 
 # Load model
-model = AutoAWQForCausalLM.from_pretrained(model_path)
+model = AutoAWQForCausalLM.from_pretrained(
+    model_path, low_cpu_mem_usage=True, use_cache=False, device_map="cuda",
+)
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
 # Define data loading methods
@@ -106,7 +108,7 @@ quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version":
 
 # Load model
 model = AutoAWQForCausalLM.from_pretrained(
-    model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
+    model_path, low_cpu_mem_usage=True, use_cache=False, device_map="cuda",
 )
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
@@ -149,7 +151,7 @@ quant_config = { "zero_point": True, "q_group_size": 64, "w_bit": 4, "version":
 
 # Load model
 model = AutoAWQForCausalLM.from_pretrained(
-    model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
+    model_path, low_cpu_mem_usage=True, use_cache=False, device_map="cuda",
 )
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
@@ -195,7 +197,7 @@ quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version":
 
 # Load model
 model = AutoAWQForCausalLM.from_pretrained(
-    model_path, device_map="cuda", **{"low_cpu_mem_usage": True}
+    model_path, low_cpu_mem_usage=True, device_map="cuda",
 )
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
@@ -234,9 +236,8 @@ llama_cpp_path = '/workspace/llama.cpp'
 quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 6, "version": "GEMM" }
 
 # Load model
-# NOTE: pass safetensors=True to load safetensors
 model = AutoAWQForCausalLM.from_pretrained(
-    model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
+    model_path, low_cpu_mem_usage=True, use_cache=False, device_map="cuda",
 )
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
diff --git a/examples/cli.py b/examples/cli.py
index a52d1193..f7f7a7b6 100644
--- a/examples/cli.py
+++ b/examples/cli.py
@@ -20,7 +20,7 @@ def main():
     parser.add_argument("--no-low_cpu_mem_usage", action="store_false", dest="low_cpu_mem_usage", help="Don't use low CPU memory")
     parser.add_argument("--use_cache", action="store_true", help="Use cache")
     parser.add_argument("--no-use_cache", action="store_false", dest="use_cache", help="Don't use cache")
-    parser.add_argument("--device_map", type=str, default=None, help="Device map for loading the pretrained model")
+    parser.add_argument("--device_map", type=str, default="auto", help="Device map for loading the pretrained model")
 
     parser.set_defaults(zero_point=True, low_cpu_mem_usage=True, use_cache=None)
 
diff --git a/examples/quantize.py b/examples/quantize.py
index 13dbb720..aa45b78a 100644
--- a/examples/quantize.py
+++ b/examples/quantize.py
@@ -7,7 +7,7 @@
 
 # Load model
 model = AutoAWQForCausalLM.from_pretrained(
-    model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
+    model_path, low_cpu_mem_usage=True, use_cache=False
 )
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
diff --git a/scripts/runpod_quantize.py b/scripts/runpod_quantize.py
index d036b48f..1fac9acd 100644
--- a/scripts/runpod_quantize.py
+++ b/scripts/runpod_quantize.py
@@ -47,6 +47,7 @@
     version = "GEMM",
     low_cpu_mem_usage = True,
    use_cache = False,
+    device_map = "auto",
 )
 
 cli_args = " ".join([f"--{k}" if isinstance(v, bool) else f"--{k} {v}" for k,v in cli_args.items()])
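
For context, a minimal sketch of what loading looks like once this patch is applied. This is an illustration, not part of the patch: the model path below is a placeholder, and the dispatch behaviour assumes the usual transformers/accelerate handling of device_map="auto".

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = "mistralai/Mistral-7B-Instruct-v0.2"  # placeholder path, not from this patch

# With this change, device_map defaults to "auto", so omitting it now
# dispatches the checkpoint across the available devices automatically.
model = AutoAWQForCausalLM.from_pretrained(
    model_path, low_cpu_mem_usage=True, use_cache=False
)

# Passing device_map explicitly still overrides the new default, e.g. the
# updated docs/examples.md snippets pin everything to a single GPU:
model_on_gpu = AutoAWQForCausalLM.from_pretrained(
    model_path, low_cpu_mem_usage=True, use_cache=False, device_map="cuda"
)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)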