[AQUA] GPU Shape Recommendation #1221

Open · wants to merge 2 commits into base: main

Changes from all commits
14 changes: 9 additions & 5 deletions ads/aqua/cli.py
@@ -14,6 +14,7 @@
from ads.aqua.finetuning import AquaFineTuningApp
from ads.aqua.model import AquaModelApp
from ads.aqua.modeldeployment import AquaDeploymentApp
from ads.aqua.shaperecommend.recommend import AquaRecommendApp
from ads.common.utils import LOG_LEVELS


@@ -29,6 +30,7 @@ class AquaCommand:
fine_tuning = AquaFineTuningApp
deployment = AquaDeploymentApp
evaluation = AquaEvaluationApp
recommend = AquaRecommendApp

def __init__(
self,
@@ -94,18 +96,20 @@ def _validate_value(flag, value):
"If you intend to chain a function call to the result, please separate the "
"flag and the subsequent function call with separator `-`."
)

@staticmethod
def install():
"""Install ADS Aqua Extension from wheel file. Set enviroment variable `AQUA_EXTENSTION_PATH` to change the wheel file path.

Return
Return
------
int:
Installation status.
"""
import subprocess

wheel_file_path = os.environ.get("AQUA_EXTENSTION_PATH", "/ads/extension/adsjupyterlab_aqua_extension*.whl")
status = subprocess.run(f"pip install {wheel_file_path}",shell=True)
return status.check_returncode
wheel_file_path = os.environ.get(
"AQUA_EXTENSTION_PATH", "/ads/extension/adsjupyterlab_aqua_extension*.whl"
)
status = subprocess.run(f"pip install {wheel_file_path}", shell=True)
return status.check_returncode
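
Because the Fire-based `AquaCommand` now exposes `recommend = AquaRecommendApp`, the recommendation entry point can also be reached directly in Python. A minimal sketch, assuming no constructor arguments are needed (the REST handler below instantiates the app the same way); the keyword arguments of `which_gpu` are not shown in this PR, so `model_id` is a hypothetical placeholder:

from ads.aqua.shaperecommend.recommend import AquaRecommendApp

app = AquaRecommendApp()
# Hypothetical kwargs; replace with the parameters which_gpu actually accepts:
# shapes = app.which_gpu(model_id="ocid1.datasciencemodel.oc1..<unique_id>")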
2 changes: 2 additions & 0 deletions ads/aqua/extension/__init__.py
@@ -12,6 +12,7 @@
)
from ads.aqua.extension.evaluation_handler import __handlers__ as __eval_handlers__
from ads.aqua.extension.finetune_handler import __handlers__ as __finetune_handlers__
from ads.aqua.extension.recommend_handler import __handlers__ as __gpu_handlers__
from ads.aqua.extension.model_handler import __handlers__ as __model_handlers__
from ads.aqua.extension.ui_handler import __handlers__ as __ui_handlers__
from ads.aqua.extension.ui_websocket_handler import __handlers__ as __ws_handlers__
@@ -24,6 +25,7 @@
+ __ui_handlers__
+ __eval_handlers__
+ __ws_handlers__
+ __gpu_handlers__
)


50 changes: 50 additions & 0 deletions ads/aqua/extension/recommend_handler.py
@@ -0,0 +1,50 @@

from tornado.web import HTTPError

from ads.aqua.common.decorator import handle_exceptions
from ads.aqua.extension.base_handler import AquaAPIhandler
from ads.aqua.extension.errors import Errors
from ads.aqua.shaperecommend.recommend import AquaRecommendApp
from ads.config import COMPARTMENT_OCID


class AquaRecommendHandler(AquaAPIhandler):
"""
Handler for Aqua GPU Recommendation REST APIs.

Methods
-------
post(self, *args, **kwargs)
Obtains the eligible compute shapes that would fit the specified model, context length, model weights, and quantization level.

Raises
------
HTTPError: For various failure scenarios such as invalid input format, missing data, etc.
"""

@handle_exceptions
def post(self, *args, **kwargs): # noqa: ARG002
"""
Lists the eligible GPU compute shapes for the specified model.

Returns
-------
List[ComputeShapeSummary]:
The list of the model deployment shapes.
"""
try:
input_data = self.get_json_body()
# input_data["compartment_id"] = self.get_argument("compartment_id", default=COMPARTMENT_OCID)
except Exception as ex:
raise HTTPError(400, Errors.INVALID_INPUT_DATA_FORMAT) from ex

if not input_data:
raise HTTPError(400, Errors.NO_INPUT_DATA)

self.finish(AquaRecommendApp().which_gpu(**input_data))

__handlers__ = [
("gpu-shape-recommendation/?([^/]*)", AquaRecommendHandler),
]
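
A minimal sketch of exercising the new endpoint from a notebook, assuming the extension mounts its handlers under the usual `aqua` prefix of the JupyterLab server; the request body fields are hypothetical, since which_gpu's signature is not part of this diff:

import requests

# Hypothetical request body; the fields which_gpu actually accepts are not shown in this PR.
body = {"model_id": "ocid1.datasciencemodel.oc1..<unique_id>"}

# Assumed base URL and "aqua" handler prefix; adjust to your server. Authentication
# (token / XSRF headers) is omitted here for brevity.
response = requests.post("http://localhost:8888/aqua/gpu-shape-recommendation", json=body)
print(response.json())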
6 changes: 6 additions & 0 deletions ads/aqua/shaperecommend/__init__.py
@@ -0,0 +1,6 @@
#!/usr/bin/env python
# Copyright (c) 2025 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
from ads.aqua.shaperecommend.recommend import AquaRecommendApp

__all__ = ["AquaRecommendApp"]
33 changes: 33 additions & 0 deletions ads/aqua/shaperecommend/constants.py
@@ -0,0 +1,33 @@
#!/usr/bin/env python
# Copyright (c) 2024, 2025 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

"""
aqua.shaperecommend.constants
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

This module contains constants used in Aqua GPU Recommendation for Models.

LLAMA_REQUIRED_FIELDS lists the fields required to estimate model memory for GQA-architecture models.

MOE_REQUIRED_FIELDS lists the fields required for Mixture-of-Experts (MoE) architecture models.

NEXT_QUANT maps the current quantization level (if applied) or the model weight dtype (if not) to the suggested next quantization levels.
"""
LLAMA_REQUIRED_FIELDS = [
"num_hidden_layers", "hidden_size", "num_attention_heads",
"num_key_value_heads", "head_dim", "intermediate_size", "vocab_size"
]

MOE_REQUIRED_FIELDS = LLAMA_REQUIRED_FIELDS + [
"num_local_experts", "intermediate_size"
]

NEXT_QUANT = {
"float32": ["bfloat16", "float16", "int8"],
"bfloat16": ["float16", "int8"],
"float16": ["int8"],
"int8": ["8bit", "4bit (Not Recommended)"],
"8bit": ["4bit (Not Recommended)"],
"4bit": ["No smaller quantization available"]
}
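
An illustrative helper, not part of this PR, showing how NEXT_QUANT can be used to suggest the next quantization level for a model's current weight dtype or quantization:

from ads.aqua.shaperecommend.constants import NEXT_QUANT

def suggest_next_quantization(current_level: str) -> list:
    """Return the suggested smaller quantization levels for the given dtype or quantization level."""
    return NEXT_QUANT.get(current_level, ["No smaller quantization available"])

# A model served in bfloat16 can be shrunk to float16 or int8:
# suggest_next_quantization("bfloat16") -> ["float16", "int8"]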
213 changes: 213 additions & 0 deletions ads/aqua/shaperecommend/estimator.py
@@ -0,0 +1,213 @@
#!/usr/bin/env python
# Copyright (c) 2025 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
from typing import Optional

from pydantic import BaseModel, Field

from ads.aqua.app import logger
from ads.aqua.shaperecommend.constants import LLAMA_REQUIRED_FIELDS, MOE_REQUIRED_FIELDS
from ads.aqua.shaperecommend.llm_config import LLMConfig


class MemoryEstimator(BaseModel):
"""
The generic estimator for Transformer-architecture models (e.g., OPT, Bloom).
Used as a fallback estimator when the identified model is not a MoE- or GQA-architecture model.
Provides properties to estimate the KV cache size, the model size, and the total footprint (KV cache + model size).
"""

llm_config: LLMConfig = Field(
...,
description="The model's config.json file with the necessary parameters for model size and KV cache estimation/",
)
batch_size: int = (
1  # estimation currently assumes a batch size of 1; larger batch sizes are not supported yet
)
seq_len: Optional[int] = Field(
4096, description="The max-seq-len to estimate the size of the KV cache."
)

@property
def kv_cache_memory(self) -> float:
"""
Estimates the KV cache size (in GB) using the LLM config.json parameters.

Uses num_attention_heads (assumes no GQA, each attention head has its own query, key, value) for estimation
"""
seq_len = self.seq_len or self.llm_config.max_seq_len
c = self.llm_config
kv_cache_dtype_bytes = (
c.bytes_per_parameter
) # vLLM uses model's weight/quantization applied to KV cache

total_bytes = (
self.batch_size
* c.num_hidden_layers
* 2
* c.num_attention_heads
* seq_len
* c.head_dim
* kv_cache_dtype_bytes
)
return total_bytes / 1e9

@property
def model_memory(self) -> float:
"""
Estimates the model size (in GB) from the estimated parameter count and the model's bytes per parameter.

Model Parameter estimation: Standard decoder-only, untied/tied embeddings possible
"""
c = self.llm_config
embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2
embedding_params = (
embedding_count * c.vocab_size * c.hidden_size
)  # input embeddings, plus a separate output projection when untied
layer_params = 12 * c.num_hidden_layers * (c.hidden_size**2) # GPT-style
num_params = layer_params + embedding_params

return num_params * c.bytes_per_parameter / 1e9

# @property
# def model_overhead(self) -> float:
# overhead = max(1, math.ceil(0.0 * self.model_memory))
# return overhead

@property
def total_memory(self) -> float:
"""
Computes the total memory footprint of the model (KV cache & model size from estimated parameters)
"""
return self.model_memory + self.kv_cache_memory


# Specialized estimators:
class LlamaMemoryEstimator(MemoryEstimator):
"""
Estimator for GQA-type architectures. Handles tied (memory savings) and untied embeddings,
and uses grouped attention (GQA) for more efficient KV cache memory estimation.

KV cache: uses num_key_value_heads (GQA), which reduces the KV cache size
Model parameter estimation: standard decoder-only, untied/tied embeddings possible
"""

@property
def model_memory(self) -> float:
"""
Returns estimated model parameter memory (in GB), accurately accounting
for Llama-style attention and MLP, and tied or untied embeddings.
"""
c = self.llm_config

embedding_params, attn_params = self._calc_attn_embed_params()

# MLP params
gate_proj = c.hidden_size * c.intermediate_size
up_proj = c.hidden_size * c.intermediate_size
down_proj = c.intermediate_size * c.hidden_size
mlp_params = gate_proj + up_proj + down_proj

# Total per-layer
layer_params = attn_params + mlp_params
# Total params
num_params = c.num_hidden_layers * layer_params + embedding_params
return num_params * c.bytes_per_parameter / 1e9

@property
def kv_cache_memory(self) -> float:
"""
Returns estimated KV cache memory in GB for GQA models.

With Grouped Query Attention, groups of query heads share a single key and value projection, so num_key_value_heads is used.
Because num_key_value_heads < num_attention_heads, the KV cache size is reduced.
"""
c = self.llm_config
seq_len = self.seq_len or getattr(c, "max_seq_len", 2048)
kv_cache_dtype_bytes = c.bytes_per_parameter
kv_heads = c.num_key_value_heads

total_bytes = (
self.batch_size
* c.num_hidden_layers
* 2
* kv_heads
* seq_len
* c.head_dim
* kv_cache_dtype_bytes
)
return total_bytes / 1e9

def _calc_attn_embed_params(self) -> tuple:
"""
Returns the embedding parameter count and attention parameter count for Llama-family (GQA) models.
"""
c = self.llm_config

# Embedding parameters
# assume tied embeddings unless tie_word_embeddings = False
embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2
embedding_params = embedding_count * c.vocab_size * c.hidden_size

q_proj = c.hidden_size * c.hidden_size
k_proj = c.hidden_size * (c.num_key_value_heads * c.head_dim)
v_proj = c.hidden_size * (c.num_key_value_heads * c.head_dim)
o_proj = c.hidden_size * c.hidden_size
attn_params = q_proj + k_proj + v_proj + o_proj

return embedding_params, attn_params


class MixtureMemoryEstimator(LlamaMemoryEstimator):
"""
Estimator for Mixture-of-Experts (MoE) architectures (e.g., Mixtral, MoE Llama).
Adds extra expert parallelism block parameter count to LlamaMemoryEstimator logic.
"""

@property
def model_memory(self) -> float:
"""
Accounts for the increase in model parameters due to additional expert MLP blocks in MoE Models.

Returns the estimated memory size of the MoE Model (in GB).
"""
c = self.llm_config
# Attention parameter count (Llama-style)
embedding_params, attn_params = self._calc_attn_embed_params()

# MoE MLP params per layer
moe_params_per_layer = (
c.num_local_experts * 3 * c.hidden_size * c.intermediate_size
)
total_params = (
c.num_hidden_layers * (attn_params + moe_params_per_layer)
+ embedding_params
)

# Convert to GB
return total_params * c.bytes_per_parameter / 1e9


def get_estimator(llm_config, **kwargs) -> MemoryEstimator:
"""
Selects the correct estimator based on the parameters defined in the config.json.
See constants.py for the LLMConfig parameters required by each specialized estimator.
Falls back to the generic MemoryEstimator if the parameters needed for the GQA and MoE architectures are missing.

Returns the appropriate MemoryEstimator based on the fields defined by the model's config.json (as represented by LLMConfig).
"""
if all(
hasattr(llm_config, f) and getattr(llm_config, f) is not None
for f in MOE_REQUIRED_FIELDS
):
return MixtureMemoryEstimator(llm_config=llm_config, **kwargs)
elif all(
hasattr(llm_config, f) and getattr(llm_config, f) is not None
for f in LLAMA_REQUIRED_FIELDS
):
return LlamaMemoryEstimator(llm_config=llm_config, **kwargs)
else:
logger.warning(
"Falling back to generic GPT estimator: required fields missing from config.json file in model."
)
return MemoryEstimator(llm_config=llm_config, **kwargs)
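
A worked example, illustrative only and not part of this PR, that plugs typical published Llama 3 8B config.json values into the LlamaMemoryEstimator formulas above to sanity-check the magnitudes:

num_hidden_layers = 32
hidden_size = 4096
num_key_value_heads = 8
head_dim = 128
intermediate_size = 14336
vocab_size = 128256
bytes_per_parameter = 2  # bfloat16 weights
batch_size, seq_len = 1, 4096

# KV cache (GQA): batch * layers * 2 (K and V) * kv_heads * seq_len * head_dim * bytes
kv_cache_gb = (batch_size * num_hidden_layers * 2 * num_key_value_heads
               * seq_len * head_dim * bytes_per_parameter) / 1e9  # ~0.54 GB

# Model weights: attention (q, k, v, o) + MLP (gate, up, down) per layer, plus untied embeddings
attn_params = 2 * hidden_size * hidden_size + 2 * hidden_size * num_key_value_heads * head_dim
mlp_params = 3 * hidden_size * intermediate_size
embedding_params = 2 * vocab_size * hidden_size  # untied input and output embeddings
total_params = num_hidden_layers * (attn_params + mlp_params) + embedding_params  # ~8.0e9
model_gb = total_params * bytes_per_parameter / 1e9  # ~16.1 GB

print(f"KV cache: {kv_cache_gb:.2f} GB, weights: {model_gb:.2f} GB, total: {kv_cache_gb + model_gb:.2f} GB")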