@@ -46,18 +46,18 @@ def kv_cache_memory(self) -> float:
         Uses num_attention_heads (assumes no GQA, each attention head has its own query, key, value) for estimation.
         """
         seq_len = self.seq_len or self.llm_config.max_seq_len
-        c = self.llm_config
+        llm_config = self.llm_config
         kv_cache_dtype_bytes = QUANT_MAPPING.get(
-            c.weight_dtype, 2
+            llm_config.weight_dtype, 2
         )  # vLLM uses model's weight applied to KV cache

         total_bytes = (
             self.batch_size
-            * c.num_hidden_layers
+            * llm_config.num_hidden_layers
             * 2
-            * c.num_attention_heads
+            * llm_config.num_attention_heads
             * seq_len
-            * c.head_dim
+            * llm_config.head_dim
             * kv_cache_dtype_bytes
         )
         return total_bytes / 1e9
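As a sanity check on the non-GQA formula above, a quick sketch with hypothetical Llama-2-7B-like values (32 layers, 32 attention heads, head_dim 128, fp16 KV cache, batch size 1) lands at roughly 2.1 GB for a 4096-token sequence:

```python
# Hypothetical values, not taken from this PR: 32 layers, 32 attention heads,
# head_dim 128, seq_len 4096, batch_size 1, fp16 KV cache (2 bytes per element).
batch_size, num_layers, num_heads, seq_len, head_dim, dtype_bytes = 1, 32, 32, 4096, 128, 2

# Same expression as kv_cache_memory: the factor of 2 counts both K and V tensors.
kv_gb = batch_size * num_layers * 2 * num_heads * seq_len * head_dim * dtype_bytes / 1e9
print(kv_gb)  # ~2.15 GB
```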
@@ -69,15 +69,15 @@ def model_memory(self) -> float:

         Model Parameter estimation: Standard decoder-only, untied/tied embeddings possible.
         """
-        c = self.llm_config
-        embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2
+        llm_config = self.llm_config
+        embedding_count = 1 if llm_config.tie_word_embeddings else 2
         embedding_params = (
-            embedding_count * c.vocab_size * c.hidden_size
+            embedding_count * llm_config.vocab_size * llm_config.hidden_size
         )  # input and output untied
-        layer_params = 12 * c.num_hidden_layers * (c.hidden_size ** 2)  # GPT-style
+        layer_params = 12 * llm_config.num_hidden_layers * (llm_config.hidden_size ** 2)  # GPT-style
         num_params = layer_params + embedding_params

-        return num_params * c.bytes_per_parameter / 1e9
+        return num_params * llm_config.bytes_per_parameter / 1e9

     @property
     def total_memory(self) -> float:
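The 12 * num_hidden_layers * hidden_size**2 term is the usual GPT-style per-layer estimate (attention plus MLP, ignoring norms and biases). A hedged worked example with hypothetical 7B-class values:

```python
# Hypothetical dense 7B-class config, not taken from this PR: hidden_size 4096,
# 32 layers, vocab 32000, tied embeddings, fp16 weights (2 bytes per parameter).
hidden_size, num_layers, vocab_size, bytes_per_parameter = 4096, 32, 32_000, 2

layer_params = 12 * num_layers * hidden_size**2   # ~6.44B, GPT-style estimate
embedding_params = 1 * vocab_size * hidden_size   # tied input/output embedding
total_gb = (layer_params + embedding_params) * bytes_per_parameter / 1e9
print(total_gb)  # ~13.1 GB of weights
```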
@@ -120,19 +120,19 @@ def construct_deployment_params(self) -> str:
         -------
         str: Parameter string for model deployment.
         """
-        c = self.llm_config
+        llm_config = self.llm_config
         params = []
-        if self.seq_len < c.max_seq_len:
+        if self.seq_len < llm_config.max_seq_len:
             params.append(VLLM_PARAMS["max_model_len"])
             params.append(str(self.seq_len))

         # Only suggest in-flight quantization for unquantized models when such quantization is requested
-        if not c.quantization and c.in_flight_quantization in IN_FLIGHT_QUANTIZATION:
+        if not llm_config.quantization and llm_config.in_flight_quantization in IN_FLIGHT_QUANTIZATION:
             # vLLM only supports 4bit in-flight quantization
             params.append(VLLM_PARAMS["in_flight_quant"])

         # add trust-remote-code if custom modules are specified
-        if c.trust_remote_code:
+        if llm_config.trust_remote_code:
             params.append(VLLM_PARAMS["trust_remote_code"])

         params = " ".join(params) if params else ""
@@ -158,12 +158,12 @@ def suggest_param_advice(self, allowed: float) -> str:
         wt_gb = self.model_memory
         batch_size = self.batch_size
         seq_len = self.seq_len
-        weight_size = getattr(self.llm_config, "weight_dtype", "unknown")
+        weight_size = self.llm_config.weight_dtype
         config = self.llm_config

         suggested_quant_msg = None
         quant_advice = ", ".join(config.suggested_quantizations)
-        quantization = getattr(config, "quantization", None)
+        quantization = config.quantization

         advice = []

@@ -272,22 +272,22 @@ def model_memory(self) -> float:
         Returns estimated model parameter memory (in GB), accurately accounting
         for Llama-style attention and MLP, and tied or untied embeddings.
         """
-        c = self.llm_config
+        llm_config = self.llm_config

         embedding_params, attn_params = self._calc_attn_embed_params()

         # MLP params
-        gate_proj = c.hidden_size * c.intermediate_size
-        up_proj = c.hidden_size * c.intermediate_size
-        down_proj = c.intermediate_size * c.hidden_size
+        gate_proj = llm_config.hidden_size * llm_config.intermediate_size
+        up_proj = llm_config.hidden_size * llm_config.intermediate_size
+        down_proj = llm_config.intermediate_size * llm_config.hidden_size
         mlp_params = gate_proj + up_proj + down_proj

         # Total per-layer
         layer_params = attn_params + mlp_params
         # Total params
-        num_params = c.num_hidden_layers * layer_params + embedding_params
+        num_params = llm_config.num_hidden_layers * layer_params + embedding_params

-        return num_params * c.bytes_per_parameter / 1e9
+        return num_params * llm_config.bytes_per_parameter / 1e9

     @property
     def kv_cache_memory(self) -> float:
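Plugging hypothetical Llama-2-7B-like numbers into this per-layer breakdown reproduces the model's known ~6.7B parameter count, which is a reasonable sanity check of the estimate (norms and biases are ignored as negligible):

```python
# Hypothetical Llama-2-7B-like config, not taken from this PR: hidden 4096,
# intermediate 11008, 32 layers, 32 KV heads (no GQA at this size), head_dim 128,
# vocab 32000, untied embeddings, fp16 weights.
h, inter, layers, kv_heads, head_dim, vocab = 4096, 11008, 32, 32, 128, 32_000

attn = h * h + 2 * h * (kv_heads * head_dim) + h * h   # q, k, v, o projections
mlp = 3 * h * inter                                    # gate, up, down projections
embed = 2 * vocab * h                                  # untied input + output embeddings
num_params = layers * (attn + mlp) + embed
print(num_params / 1e9, num_params * 2 / 1e9)  # ~6.74B params, ~13.5 GB in fp16
```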
@@ -297,18 +297,18 @@ def kv_cache_memory(self) -> float:
         Grouped Query Attention uses num_key_value_heads, where groups of Q heads share a K and V projection.
         num_key_value_heads < num_attention_heads, which reduces the KV Cache size.
         """
-        c = self.llm_config
-        seq_len = self.seq_len or getattr(c, "max_seq_len", 2048)
-        kv_cache_dtype_bytes = QUANT_MAPPING.get(c.weight_dtype, 2)
-        kv_heads = c.num_key_value_heads
+        llm_config = self.llm_config
+        seq_len = self.seq_len or llm_config.max_seq_len
+        kv_cache_dtype_bytes = QUANT_MAPPING.get(llm_config.weight_dtype, 2)
+        kv_heads = llm_config.num_key_value_heads

         total_bytes = (
             self.batch_size
-            * c.num_hidden_layers
+            * llm_config.num_hidden_layers
             * 2
             * kv_heads
             * seq_len
-            * c.head_dim
+            * llm_config.head_dim
             * kv_cache_dtype_bytes
         )
         return total_bytes / 1e9
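To illustrate why the GQA path matters, a hedged example with hypothetical values (8 KV heads shared by 32 query heads) shows the expected 4x reduction versus the non-GQA estimate:

```python
# Hypothetical GQA config, not taken from this PR: 32 layers, 8 KV heads vs. 32
# query heads, head_dim 128, seq_len 8192, batch 1, fp16 KV cache (2 bytes).
batch, layers, kv_heads, q_heads, seq_len, head_dim, dtype_bytes = 1, 32, 8, 32, 8192, 128, 2

gqa_gb = batch * layers * 2 * kv_heads * seq_len * head_dim * dtype_bytes / 1e9
mha_gb = batch * layers * 2 * q_heads * seq_len * head_dim * dtype_bytes / 1e9
print(gqa_gb, mha_gb)  # ~1.07 GB vs ~4.29 GB: 4x smaller with grouped KV heads
```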
@@ -317,17 +317,17 @@ def _calc_attn_embed_params(self) -> tuple:
         """
         Returns the embedding parameter count and attention parameter count for Llama-family (GQA) models.
         """
-        c = self.llm_config
+        llm_config = self.llm_config

         # Embedding parameters
         # assume tied embeddings unless tie_word_embeddings = False
-        embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2
-        embedding_params = embedding_count * c.vocab_size * c.hidden_size
+        embedding_count = 1 if llm_config.tie_word_embeddings else 2
+        embedding_params = embedding_count * llm_config.vocab_size * llm_config.hidden_size

-        q_proj = c.hidden_size * c.hidden_size
-        k_proj = c.hidden_size * (c.num_key_value_heads * c.head_dim)
-        v_proj = c.hidden_size * (c.num_key_value_heads * c.head_dim)
-        o_proj = c.hidden_size * c.hidden_size
+        q_proj = llm_config.hidden_size * llm_config.hidden_size
+        k_proj = llm_config.hidden_size * (llm_config.num_key_value_heads * llm_config.head_dim)
+        v_proj = llm_config.hidden_size * (llm_config.num_key_value_heads * llm_config.head_dim)
+        o_proj = llm_config.hidden_size * llm_config.hidden_size
         attn_params = q_proj + k_proj + v_proj + o_proj

         return embedding_params, attn_params
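The GQA-aware projection sizes are the main difference from the naive estimate: K and V shrink with num_key_value_heads while Q and O stay full width. A small sketch with assumed values:

```python
# Assumed values for illustration only: hidden 4096, 8 KV heads, head_dim 128.
h, kv_heads, head_dim = 4096, 8, 128

q_proj = o_proj = h * h                      # full-width query and output projections
k_proj = v_proj = h * (kv_heads * head_dim)  # narrower K/V projections under GQA
print(q_proj, k_proj)  # 16,777,216 vs 4,194,304: K and V are 4x smaller than Q here
```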
@@ -346,21 +346,21 @@ def model_memory(self) -> float:

         Returns the estimated memory size of the MoE Model (in GB).
         """
-        c = self.llm_config
+        llm_config = self.llm_config
         # Attention parameter count (Llama-style)
         embedding_params, attn_params = self._calc_attn_embed_params()

         # MoE MLP params per layer
         moe_params_per_layer = (
-            c.num_local_experts * 3 * c.hidden_size * c.intermediate_size
+            llm_config.num_local_experts * 3 * llm_config.hidden_size * llm_config.intermediate_size
         )
         total_params = (
-            c.num_hidden_layers * (attn_params + moe_params_per_layer)
+            llm_config.num_hidden_layers * (attn_params + moe_params_per_layer)
             + embedding_params
         )

         # Convert to GB
-        return total_params * c.bytes_per_parameter / 1e9
+        return total_params * llm_config.bytes_per_parameter / 1e9


 def get_estimator(llm_config, **kwargs) -> MemoryEstimator:
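Because every expert keeps its own gate/up/down projections, the MoE term dominates this estimate. With hypothetical Mixtral-8x7B-like values it lands near the model's known ~46.7B parameters (router weights ignored as negligible):

```python
# Hypothetical Mixtral-8x7B-like config, not taken from this PR: 8 local experts,
# hidden 4096, intermediate 14336, 32 layers, 8 KV heads, head_dim 128, vocab 32000,
# untied embeddings, fp16 weights.
h, inter, layers, experts, kv_heads, head_dim, vocab = 4096, 14336, 32, 8, 8, 128, 32_000

attn = 2 * h * h + 2 * h * (kv_heads * head_dim)   # q, o plus GQA-sized k, v projections
moe_mlp = experts * 3 * h * inter                  # each expert has gate, up, down
embed = 2 * vocab * h
total = layers * (attn + moe_mlp) + embed
print(total / 1e9, total * 2 / 1e9)  # ~46.7B params, ~93.4 GB in fp16
```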