
Commit bbc0007

corrected output files

1 parent 4a2c63a commit bbc0007

File tree: 12 files changed, +853 −251 lines changed

ads/aqua/modeldeployment/deployment.py (14 additions, 10 deletions)
@@ -1260,34 +1260,38 @@ def validate_deployment_params(

     def recommend_shape(self, **kwargs) -> Union[Table, ShapeRecommendationReport]:
         """
-        For the CLI (set generate_table = True), generates the table (in rich diff) with valid
+        For the CLI (generate_table = True by default), generates the table (in rich diff) with valid
         GPU deployment shapes for the provided model and configuration.

         For the API (set generate_table = False), generates the JSON with valid
         GPU deployment shapes for the provided model and configuration.

-        Validates if recommendations are generated, calls method to construct the rich diff
-        table with the recommendation data.
+        Validates the input and determines whether recommendations are available.

         Parameters
         ----------
-        model_ocid : str
-            OCID of the model to recommend feasible compute shapes.
+        **kwargs
+            model_ocid : str
+                (Required) The OCID of the model to recommend feasible compute shapes for.
+            generate_table : bool, optional
+                If True, generate and return a rich-diff table; if False, return a JSON response (default is False).
+            compartment_id : str, optional
+                The OCID of the user's compartment to use for the recommendation.

         Returns
         -------
         Table (generate_table = True)
-            A table format for the recommendation report with compatible deployment shapes
-            or troubleshooting info citing the largest shapes if no shape is suitable.
+            If `generate_table` is True, a table displaying the recommendation report with compatible deployment shapes,
+            or troubleshooting info if no shape is suitable.

         ShapeRecommendationReport (generate_table = False)
-            A recommendation report with compatible deployment shapes, or troubleshooting info
-            citing the largest shapes if no shape is suitable.
+            If `generate_table` is False, a structured recommendation report with compatible deployment shapes,
+            or troubleshooting info citing the largest shapes if no shape is suitable.

         Raises
         ------
         AquaValueError
-            If model type is unsupported by tool (no recommendation report generated)
+            If the model type is unsupported and no recommendation report can be generated.
         """
         deployment_config = self.get_deployment_config(model_id=kwargs.get("model_id"))
         kwargs["deployment_config"] = deployment_config

ads/aqua/shaperecommend/constants.py (18 additions, 1 deletion)

@@ -38,6 +38,14 @@
     "4bit": ["No smaller quantization available"],
 }

+RUNTIME_WEIGHTS = {
+    "use_bfloat16" : "bfloat16",
+    "use_fp16" : "float16",
+    "use_fp32" : "float32",
+    "use_int8" : "int8",
+    "use_int4" : "int4",
+    "use_bfloat32" : "bfloat32"
+}

 TEXT_GENERATION = "text_generation"
 SAFETENSORS = "safetensors"

@@ -80,9 +88,17 @@

 VLLM_PARAMS_KEY = "VLLM_PARAMS"
 VLLM_ENV_KEY = "VLLM"
+
 QUANT_FLAG = "--quantization"
+WEIGHT_DTYPE_FLAG = "--dtype"
 MAX_MODEL_LEN_FLAG = "--max-model-len"

+PARAM_FLAG_MAP = {
+    "--quantization": ("quantization", str),
+    "--dtype": ("weight_dtype", str),
+    "--max-model-len": ("max_model_len", int)
+}
+
 TROUBLESHOOT_MSG = "The selected model is too large to fit on standard GPU shapes with the current configuration.\nAs troubleshooting, we have suggested the two largest available GPU shapes using the smallest quantization level ('4bit') to maximize chances of fitting the model. "

 VLLM_PARAMS = {
@@ -91,7 +107,8 @@
     "trust_remote_code": "--trust-remote-code"
 }

-DEFAULT_WEIGHT_SIZE = "bfloat16"
+
+DEFAULT_WEIGHT_SIZE = "float32"
 DEFAULT_MAX_SEQ_LEN = 4096

 BITS_AND_BYTES_8BIT = "8bit"
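The (field, type) tuples in PARAM_FLAG_MAP suggest how a deployment parameter string can be parsed back into typed config fields. A minimal sketch, using a hypothetical helper that is not part of this commit:

# Hypothetical helper (not in this commit): uses PARAM_FLAG_MAP to pull typed
# values out of a vLLM-style parameter string.
import shlex

PARAM_FLAG_MAP = {
    "--quantization": ("quantization", str),
    "--dtype": ("weight_dtype", str),
    "--max-model-len": ("max_model_len", int)
}

def parse_vllm_params(params: str) -> dict:
    """Map each recognized flag to its typed field; ignore unknown flags."""
    tokens = shlex.split(params)
    parsed = {}
    for i, token in enumerate(tokens):
        if token in PARAM_FLAG_MAP and i + 1 < len(tokens):
            field, cast = PARAM_FLAG_MAP[token]
            parsed[field] = cast(tokens[i + 1])
    return parsed

# parse_vllm_params("--dtype float16 --max-model-len 4096")
# -> {"weight_dtype": "float16", "max_model_len": 4096}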

ads/aqua/shaperecommend/estimator.py (39 additions, 39 deletions)
@@ -46,18 +46,18 @@ def kv_cache_memory(self) -> float:
         Uses num_attention_heads (assumes no GQA, each attention head has its own query, key, value) for estimation.
         """
         seq_len = self.seq_len or self.llm_config.max_seq_len
-        c = self.llm_config
+        llm_config = self.llm_config
         kv_cache_dtype_bytes = QUANT_MAPPING.get(
-            c.weight_dtype, 2
+            llm_config.weight_dtype, 2
         )  # vLLM uses model's weight applied to KV cache

         total_bytes = (
             self.batch_size
-            * c.num_hidden_layers
+            * llm_config.num_hidden_layers
             * 2
-            * c.num_attention_heads
+            * llm_config.num_attention_heads
             * seq_len
-            * c.head_dim
+            * llm_config.head_dim
             * kv_cache_dtype_bytes
         )
         return total_bytes / 1e9
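As a sanity check on this formula (illustrative numbers, not from the commit): with batch_size = 1, num_hidden_layers = 32, num_attention_heads = 32, head_dim = 128, seq_len = 4096 and a 2-byte dtype, total_bytes = 1 * 32 * 2 * 32 * 4096 * 128 * 2 ≈ 2.15e9, i.e. roughly 2.15 GB of KV cache for a single sequence.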
@@ -69,15 +69,15 @@ def model_memory(self) -> float:

         Model Parameter estimation: Standard decoder-only, untied/tied embeddings possible.
         """
-        c = self.llm_config
-        embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2
+        llm_config = self.llm_config
+        embedding_count = 1 if llm_config.tie_word_embeddings else 2
         embedding_params = (
-            embedding_count * c.vocab_size * c.hidden_size
+            embedding_count * llm_config.vocab_size * llm_config.hidden_size
         )  # input and output untied
-        layer_params = 12 * c.num_hidden_layers * (c.hidden_size**2)  # GPT-style
+        layer_params = 12 * llm_config.num_hidden_layers * (llm_config.hidden_size**2)  # GPT-style
         num_params = layer_params + embedding_params

-        return num_params * c.bytes_per_parameter / 1e9
+        return num_params * llm_config.bytes_per_parameter / 1e9

     @property
     def total_memory(self) -> float:
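To see the 12 * L * h^2 rule in action (numbers mine): a GPT-2-small-scale config with num_hidden_layers = 12, hidden_size = 768, vocab_size = 50257 and tied embeddings gives layer_params = 12 * 12 * 768^2 ≈ 84.9M and embedding_params = 50257 * 768 ≈ 38.6M, about 124M parameters in total, which matches GPT-2's published size; at the new float32 default (4 bytes per parameter) that is roughly 0.49 GB of weights.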
@@ -120,19 +120,19 @@ def construct_deployment_params(self) -> str:
         -------
         str: Parameter string for model deployment.
         """
-        c = self.llm_config
+        llm_config = self.llm_config
         params = []
-        if self.seq_len < c.max_seq_len:
+        if self.seq_len < llm_config.max_seq_len:
             params.append(VLLM_PARAMS["max_model_len"])
             params.append(str(self.seq_len))

         # Only suggest in-flight quantization for unquantized models when such quantization is requested
-        if not c.quantization and c.in_flight_quantization in IN_FLIGHT_QUANTIZATION:
+        if not llm_config.quantization and llm_config.in_flight_quantization in IN_FLIGHT_QUANTIZATION:
             # vLLM only supports 4bit in-flight quantization
             params.append(VLLM_PARAMS["in_flight_quant"])

         # add trust-remote-code if custom modules are specified
-        if c.trust_remote_code:
+        if llm_config.trust_remote_code:
             params.append(VLLM_PARAMS["trust_remote_code"])

         params = " ".join(params) if params else ""
@@ -158,12 +158,12 @@ def suggest_param_advice(self, allowed: float) -> str:
         wt_gb = self.model_memory
         batch_size = self.batch_size
         seq_len = self.seq_len
-        weight_size = getattr(self.llm_config, "weight_dtype", "unknown")
+        weight_size = self.llm_config.weight_dtype
         config = self.llm_config

         suggested_quant_msg = None
         quant_advice = ", ".join(config.suggested_quantizations)
-        quantization = getattr(config, "quantization", None)
+        quantization = config.quantization

         advice = []

@@ -272,22 +272,22 @@ def model_memory(self) -> float:
         Returns estimated model parameter memory (in GB), accurately accounting
         for Llama-style attention and MLP, and tied or untied embeddings.
         """
-        c = self.llm_config
+        llm_config = self.llm_config

         embedding_params, attn_params = self._calc_attn_embed_params()

         # MLP params
-        gate_proj = c.hidden_size * c.intermediate_size
-        up_proj = c.hidden_size * c.intermediate_size
-        down_proj = c.intermediate_size * c.hidden_size
+        gate_proj = llm_config.hidden_size * llm_config.intermediate_size
+        up_proj = llm_config.hidden_size * llm_config.intermediate_size
+        down_proj = llm_config.intermediate_size * llm_config.hidden_size
         mlp_params = gate_proj + up_proj + down_proj

         # Total per-layer
         layer_params = attn_params + mlp_params
         # Total params
-        num_params = c.num_hidden_layers * layer_params + embedding_params
+        num_params = llm_config.num_hidden_layers * layer_params + embedding_params

-        return num_params * c.bytes_per_parameter / 1e9
+        return num_params * llm_config.bytes_per_parameter / 1e9

     @property
     def kv_cache_memory(self) -> float:
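Plugging in the published Llama-2-7B configuration (hidden_size = 4096, intermediate_size = 11008, num_hidden_layers = 32, vocab_size = 32000, 32 KV heads with head_dim = 128, untied embeddings) gives attn_params ≈ 67.1M and mlp_params ≈ 135.3M per layer, so num_params ≈ 32 * 202.4M + 262M ≈ 6.74B, matching that model's actual parameter count; at 2 bytes per parameter that is about 13.5 GB of weights.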
@@ -297,18 +297,18 @@ def kv_cache_memory(self) -> float:
         Grouped Query Attention uses num_key_value_heads, which groups of Q heads share a K and V projection.
         num_key_value_heads < num_attention_heads, which reduces the KV Cache size.
         """
-        c = self.llm_config
-        seq_len = self.seq_len or getattr(c, "max_seq_len", 2048)
-        kv_cache_dtype_bytes = QUANT_MAPPING.get(c.weight_dtype, 2)
-        kv_heads = c.num_key_value_heads
+        llm_config = self.llm_config
+        seq_len = self.seq_len or llm_config.max_seq_len
+        kv_cache_dtype_bytes = QUANT_MAPPING.get(llm_config.weight_dtype, 2)
+        kv_heads = llm_config.num_key_value_heads

         total_bytes = (
             self.batch_size
-            * c.num_hidden_layers
+            * llm_config.num_hidden_layers
             * 2
             * kv_heads
             * seq_len
-            * c.head_dim
+            * llm_config.head_dim
             * kv_cache_dtype_bytes
         )
         return total_bytes / 1e9
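The GQA savings can be read straight off the formula: a config with num_attention_heads = 32 but num_key_value_heads = 8 needs only a quarter of the KV cache that the no-GQA estimator above would predict, since kv_heads replaces num_attention_heads as the multiplier.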
@@ -317,17 +317,17 @@ def _calc_attn_embed_params(self) -> tuple:
        """
        Returns the embedding parameter count and attention parameter count for Llama-family (GQA) models.
        """
-        c = self.llm_config
+        llm_config = self.llm_config

         # Embedding parameters
         # assume tied embeddings unless tie_word_embeddings = False
-        embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2
-        embedding_params = embedding_count * c.vocab_size * c.hidden_size
+        embedding_count = 1 if llm_config.tie_word_embeddings else 2
+        embedding_params = embedding_count * llm_config.vocab_size * llm_config.hidden_size

-        q_proj = c.hidden_size * c.hidden_size
-        k_proj = c.hidden_size * (c.num_key_value_heads * c.head_dim)
-        v_proj = c.hidden_size * (c.num_key_value_heads * c.head_dim)
-        o_proj = c.hidden_size * c.hidden_size
+        q_proj = llm_config.hidden_size * llm_config.hidden_size
+        k_proj = llm_config.hidden_size * (llm_config.num_key_value_heads * llm_config.head_dim)
+        v_proj = llm_config.hidden_size * (llm_config.num_key_value_heads * llm_config.head_dim)
+        o_proj = llm_config.hidden_size * llm_config.hidden_size
         attn_params = q_proj + k_proj + v_proj + o_proj

         return embedding_params, attn_params
@@ -346,21 +346,21 @@ def model_memory(self) -> float:

         Returns the estimated memory size of the MoE Model (in GB).
         """
-        c = self.llm_config
+        llm_config = self.llm_config
         # Attention parameter count (Llama-style)
         embedding_params, attn_params = self._calc_attn_embed_params()

         # MoE MLP params per layer
         moe_params_per_layer = (
-            c.num_local_experts * 3 * c.hidden_size * c.intermediate_size
+            llm_config.num_local_experts * 3 * llm_config.hidden_size * llm_config.intermediate_size
         )
         total_params = (
-            c.num_hidden_layers * (attn_params + moe_params_per_layer)
+            llm_config.num_hidden_layers * (attn_params + moe_params_per_layer)
             + embedding_params
         )

         # Convert to GB
-        return total_params * c.bytes_per_parameter / 1e9
+        return total_params * llm_config.bytes_per_parameter / 1e9


 def get_estimator(llm_config, **kwargs) -> MemoryEstimator:
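A stand-alone sketch of the MoE arithmetic above (a toy re-implementation of the commit's formulas, not the module's API; ToyConfig and its defaults are assumptions, with numbers following the published Mixtral-8x7B configuration):

# Toy re-implementation of the MoE estimate; not part of this commit.
from dataclasses import dataclass

@dataclass
class ToyConfig:
    num_hidden_layers: int = 32
    hidden_size: int = 4096
    intermediate_size: int = 14336
    vocab_size: int = 32000
    num_key_value_heads: int = 8
    head_dim: int = 128
    num_local_experts: int = 8
    tie_word_embeddings: bool = False
    bytes_per_parameter: float = 2.0  # bfloat16

def moe_model_memory_gb(cfg: ToyConfig) -> float:
    # Attention (GQA): Q and O are hidden x hidden; K and V are hidden x (kv_heads * head_dim).
    kv_width = cfg.num_key_value_heads * cfg.head_dim
    attn_params = 2 * cfg.hidden_size**2 + 2 * cfg.hidden_size * kv_width
    # Embeddings double when the input/output matrices are untied.
    embedding_count = 1 if cfg.tie_word_embeddings else 2
    embedding_params = embedding_count * cfg.vocab_size * cfg.hidden_size
    # Each expert carries gate/up/down projections: 3 * hidden * intermediate.
    moe_params_per_layer = cfg.num_local_experts * 3 * cfg.hidden_size * cfg.intermediate_size
    total_params = cfg.num_hidden_layers * (attn_params + moe_params_per_layer) + embedding_params
    return total_params * cfg.bytes_per_parameter / 1e9

print(round(moe_model_memory_gb(ToyConfig()), 1))  # ~93.4 GB of bf16 weights (~46.7B params)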
