Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions dlrover/brain/python/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,6 @@ class Node(object):
class DefaultResource(object):
    """Namespace for the default worker resource constants."""

    # NOTE(review): the unit of WORKER_CPU (presumably cores) is not stated
    # in this file -- confirm against the code that consumes it.
    WORKER_CPU = 4
    # 8 * 2**20 == 8388608. NOTE(review): the unit is not stated in this
    # file -- confirm against the code that consumes it.
    WORKER_MEM = 8 * 1024 ** 2

class UnitConvertor(object):
    """Namespace for unit-conversion factors."""

    # Number of bytes in one GiB (2**30); multiply a GiB count by this
    # factor to obtain bytes.
    GIB_TO_BYTES = 1 << 30
Empty file.
15 changes: 15 additions & 0 deletions dlrover/brain/python/config/manifests/brain-base-configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# ConfigMap read by BaseOptimizeJobResource.generate_node_resource(), which
# looks up "brain-base-config" in the namespace of the job being optimized.
apiVersion: v1
kind: ConfigMap
metadata:
  name: brain-base-config
  namespace: dlrover
data:
  # JSON payload (comments must stay outside the block scalar below).
  # All four fields of "customize_worker_resource" are required by the reader:
  #   cpu, memory, gpus - parsed with int(); negative values are rejected
  #   memory            - expressed in GiB; the reader converts it to bytes
  #   gpu_type          - string; surrounding whitespace is stripped and an
  #                       empty value falls back to "none"
  config.json: |
    {
      "customize_worker_resource": {
        "cpu": 16,
        "memory": 64,
        "gpus": 2,
        "gpu_type": "huawei.com/Ascend910B3"
      }
    }
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# ConfigMap holding the tuning parameters grouped under "opt_config".
# NOTE(review): the code that reads this ConfigMap is not visible here, so
# the meaning and units of each key (presumably seconds for the *_interval,
# *_time and *_time_window entries, and percent for the *_threshold entries)
# should be confirmed against the consumer before changing any value.
apiVersion: v1
kind: ConfigMap
metadata:
  name: brain-opt-config
  namespace: dlrover
data:
  # JSON payload (comments must stay outside the block scalar below).
  config.json: |
    {
      "opt_config": {
        "incremental_memory_factor": 2,
        "incremental_gpus_factor": 1,
        "reduction_gpus_factor": 1,
        "optimize_add_gpus_utilization_threshold": 10,
        "optimize_reduce_gpus_utilization_threshold": 5,
        "trigger_resource_elastic_time_window": 600,
        "min_samples_in_window": 30,
        "max_buffer_size": 200,
        "vertical_elastic_scale_interval": 180,
        "vertical_elastic_opt_interval": 36000,
        "vertical_elastic_max_opt_num": 2,
        "ckpt_save_status_check_max_wait_time": 120,
        "ckpt_save_status_check_interval": 1,
        "log_interval_check_nums": 5
      }
    }
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# ConfigMap read by ManualOptimizeJobResource.generate_node_resource(), which
# looks up "brain-manual-scale-config" in the namespace of the job being
# optimized.
apiVersion: v1
kind: ConfigMap
metadata:
  name: brain-manual-scale-config
  namespace: dlrover
data:
  # JSON payload (comments must stay outside the block scalar below).
  # "customize_worker_resource": cpu, memory, gpus and gpu_type are all
  #   required by the reader; memory is expressed in GiB.
  # "job_names": comma-separated list.
  #   NOTE(review): presumably the jobs this manual resource applies to; the
  #   code that reads this key is not visible here -- confirm.
  config.json: |
    {
      "customize_worker_resource": {
        "cpu": 16,
        "memory": 64,
        "gpus": 2,
        "gpu_type": "huawei.com/Ascend910B3"
      },
      "job_names": "gpu-elastic-manual,test2"
    }
75 changes: 75 additions & 0 deletions dlrover/brain/python/config/manifests/brain-service-dev.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
---
# Development manifest for the DLRover brain service.
# The Service exposes port 50002 on fixed node port 30002; the Deployment
# installs the source tree in editable mode before starting the server.
apiVersion: v1
kind: Service
metadata:
  name: dlrover-brain
  namespace: dlrover
spec:
  type: NodePort
  ports:
    - port: 50002
      protocol: TCP
      targetPort: 50002
      nodePort: 30002
  selector:
    app: dlrover-brain

---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: dlrover-brain
  name: dlrover-brain
  namespace: dlrover
spec:
  replicas: 1
  selector:
    matchLabels:
      app: dlrover-brain
  template:
    metadata:
      labels:
        app: dlrover-brain
      name: dlrover-brain
      namespace: dlrover
    spec:
      serviceAccountName: dlrover-controller-manager
      volumes:
        - name: pvc-nas
          persistentVolumeClaim:
            claimName: elastic-train-pvc-dlrover
      containers:
        - command:
            - /bin/bash
            - -c
            # A literal block scalar keeps the real newlines, so the trailing
            # backslashes are genuine shell line continuations. In a folded
            # plain scalar they would become "\ " and hand an extra " "
            # argument to pip and python.
            - |
              # Kubernetes does not shell-expand env values, so PYTHONPATH is
              # extended here (with the container's start directory) instead
              # of through an `env` entry.
              export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${PWD}"
              (cd /xxx/DLRover && pip install -e . \
                && python -m dlrover.brain.python.server.server -alsologtostderr \
                --namespace dlrover --port 50002 2>&1) |
                tee /data/logs/brain-logs/brain.log
              # Propagate the server's exit status rather than tee's.
              exit ${PIPESTATUS[0]}
          env:
            - name: POD_IP
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: status.podIP
            - name: TZ
              value: Asia/Shanghai
          image: registry.cic.cmbchina.cn/cic-arm/cmb-neotrain-dlrover-brain:master-v0.7.0-hw-20260212
          imagePullPolicy: IfNotPresent
          name: dlrover-brain
          ports:
            - containerPort: 50002
              protocol: TCP
          volumeMounts:
            # The log file written by `tee` lives on this volume.
            - name: pvc-nas
              mountPath: /data
          resources:
            limits:
              cpu: "1"
              memory: 1Gi
            requests:
              cpu: "1"
              memory: 1Gi
73 changes: 73 additions & 0 deletions dlrover/brain/python/config/manifests/brain-service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
---
# Manifest for the DLRover brain service.
# The Service exposes port 50002 as a NodePort (node port left for the
# cluster to assign); the Deployment runs a single replica of the server.
apiVersion: v1
kind: Service
metadata:
  name: dlrover-brain
  namespace: dlrover
spec:
  type: NodePort
  ports:
    - port: 50002
      protocol: TCP
      targetPort: 50002
  selector:
    app: dlrover-brain

---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: dlrover-brain
  name: dlrover-brain
  namespace: dlrover
spec:
  replicas: 1
  selector:
    matchLabels:
      app: dlrover-brain
  template:
    metadata:
      labels:
        app: dlrover-brain
      name: dlrover-brain
      namespace: dlrover
    spec:
      serviceAccountName: dlrover-controller-manager
      volumes:
        - name: pvc-nas
          persistentVolumeClaim:
            claimName: elastic-train-pvc-dlrover
      containers:
        - command:
            - /bin/bash
            - -c
            # A literal block scalar keeps the real newlines, so the trailing
            # backslash is a genuine shell line continuation. In a folded
            # plain scalar it would become "\ " and hand an extra " "
            # argument to python.
            - |
              # Kubernetes does not shell-expand env values, so PYTHONPATH is
              # extended here (with the container's start directory) instead
              # of through an `env` entry.
              export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${PWD}"
              (python -m dlrover.brain.python.server.server -alsologtostderr \
                --namespace dlrover --port 50002 2>&1) |
                tee /data/logs/brain-logs/brain.log
              # Propagate the server's exit status rather than tee's.
              exit ${PIPESTATUS[0]}
          env:
            - name: POD_IP
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: status.podIP
            - name: TZ
              value: Asia/Shanghai
          image: registry.cic.cmbchina.cn/cic-arm/cmb-neotrain-dlrover-brain:master-v0.7.0-hw-20260212
          imagePullPolicy: IfNotPresent
          name: dlrover-brain
          ports:
            - containerPort: 50002
              protocol: TCP
          volumeMounts:
            # The log file written by `tee` lives on this volume.
            - name: pvc-nas
              mountPath: /data
          resources:
            limits:
              cpu: "1"
              memory: 1Gi
            requests:
              cpu: "1"
              memory: 1Gi
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@

from dlrover.brain.python.common.job import (
JobMeta,
NodeResource,
OptimizeConfig,
)
from dlrover.brain.python.common.constants import (
UnitConvertor,
)
from dlrover.brain.python.platform.k8s.configmap import ConfigMapReader
from dlrover.python.common.log import default_logger as logger
from typing import Optional, Dict, Any

class BaseOptimizeJobResource:
    """Optimizer that returns the worker resource configured in a ConfigMap.

    The resource is read from the ``brain-base-config`` ConfigMap in the
    job's namespace. Whenever that data is missing or invalid, the
    ``NodeResource`` created in ``__init__`` is returned instead.
    """

    def __init__(self):
        # Fallback resource handed back on every validation failure.
        self.current_node_resource = NodeResource()

    @staticmethod
    def get_name() -> str:
        """Return the identifier of this optimizer."""
        return "BaseOptimizeJobResource"

    def generate_node_resource(self, job: JobMeta, conf: OptimizeConfig) -> NodeResource:
        """Build the worker ``NodeResource`` from the base ConfigMap.

        Args:
            job: Job metadata; only ``job.namespace`` is read here.
            conf: Optimize configuration; not used by this implementation.

        Returns:
            A ``NodeResource`` built from ``customize_worker_resource``
            (memory converted from GiB to bytes), or
            ``self.current_node_resource`` when the ConfigMap content is
            missing, incomplete or not convertible.
        """
        fallback = self.current_node_resource
        json_data = ConfigMapReader(job.namespace, "brain-base-config").read_json_Data()

        if not isinstance(json_data, dict):
            logger.warning(f"ConfigMap data is not a dictionary, type: {type(json_data)}. Using default empty resource.")
            return fallback

        customize_worker_resource = json_data.get("customize_worker_resource", None)
        if not (customize_worker_resource and isinstance(customize_worker_resource, dict)):
            logger.warning("ConfigMap 'customize_worker_resource' is missing or invalid. Using default empty resource.")
            return fallback

        logger.info(f"ConfigMap customize_worker_resource data: {customize_worker_resource}")

        # Every key is mandatory; collect the absent (or null) ones in a
        # fixed order so the error message is stable.
        field_names = ("cpu", "memory", "gpus", "gpu_type")
        raw = {key: customize_worker_resource.get(key) for key in field_names}
        missing_fields = [key for key in field_names if raw[key] is None]
        if missing_fields:
            logger.error(f"Missing required fields in customize_worker_resource: {missing_fields}. Using default empty resource.")
            return fallback

        try:
            cpu = int(raw["cpu"])
            memory_gib = int(raw["memory"])  # configured in GiB
            gpu_nums = int(raw["gpus"])

            if min(cpu, memory_gib, gpu_nums) < 0:
                raise ValueError("Resource values cannot be negative.")

            # NodeResource receives memory in bytes.
            memory_bytes = UnitConvertor.GIB_TO_BYTES * memory_gib

            gpu_type = str(raw["gpu_type"]).strip()
            if gpu_type == "":
                logger.warning("GPU type is empty, defaulting to 'none' or handling as per business logic.")
                gpu_type = "none"
        except (ValueError, TypeError) as e:
            logger.error(f"Invalid data type in customize_worker_resource: {e}. Using default resource.")
            return fallback

        logger.info(
            f"Applying custom resources -> CPU: {cpu}, Memory: {memory_gib}GiB, GPUs: {gpu_nums}, Type: {gpu_type}"
        )
        return NodeResource(
            cpu=cpu,
            memory=memory_bytes,
            gpu=gpu_nums,
            gpu_type=gpu_type,
        )
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@

from dlrover.brain.python.common.job import (
JobMeta,
NodeResource,
OptimizeConfig,
)
from dlrover.brain.python.common.constants import (
UnitConvertor,
)
from dlrover.brain.python.platform.k8s.configmap import ConfigMapReader
from dlrover.brain.python.platform.k8s.monitor import ResourceMonitor
from dlrover.python.common.log import default_logger as logger
from typing import Optional, Dict, Any

class ManualOptimizeJobResource:
    """Optimizer that applies a manually configured worker resource.

    The resource is read from the ``brain-manual-scale-config`` ConfigMap in
    the job's namespace and is applied only when the cluster reports enough
    idle GPUs of the requested type. In every other case the
    ``NodeResource`` created in ``__init__`` is returned.
    """

    def __init__(self):
        # Fallback resource handed back on every validation/capacity failure.
        self.current_node_resource = NodeResource()

    @staticmethod
    def get_name() -> str:
        """Return the identifier of this optimizer."""
        return "ManualOptimizeJobResource"

    @staticmethod
    def _parse_worker_resource(customize_worker_resource: Dict[str, Any]) -> tuple:
        """Convert the raw ConfigMap fields into typed values.

        Returns:
            ``(cpu, memory_gib, gpu_nums, gpu_type)`` where the first three
            are non-negative ints and ``gpu_type`` is a stripped string
            (possibly empty).

        Raises:
            ValueError: A value is not integer-like or is negative.
            TypeError: A value has a type ``int()`` cannot convert.
        """
        cpu = int(customize_worker_resource.get("cpu"))
        memory_gib = int(customize_worker_resource.get("memory"))  # GiB
        gpu_nums = int(customize_worker_resource.get("gpus"))

        if cpu < 0 or memory_gib < 0 or gpu_nums < 0:
            raise ValueError("Resource values cannot be negative.")

        gpu_type = str(customize_worker_resource.get("gpu_type")).strip()
        return cpu, memory_gib, gpu_nums, gpu_type

    @staticmethod
    def _has_enough_idle_gpus(cluster_idle_gpus, gpu_nums: int) -> bool:
        """Return True when idle GPUs exist and cover the requested count."""
        # An empty/zero/None idle count always means "not enough"; otherwise
        # the idle count must reach the request.
        return bool(cluster_idle_gpus) and cluster_idle_gpus >= gpu_nums

    def generate_node_resource(self, job: JobMeta, conf: OptimizeConfig) -> NodeResource:
        """Build the worker ``NodeResource`` from the manual-scale ConfigMap.

        Args:
            job: Job metadata; only ``job.namespace`` is read here.
            conf: Optimize configuration; not used by this implementation.

        Returns:
            A ``NodeResource`` built from ``customize_worker_resource``
            (memory converted from GiB to bytes), or
            ``self.current_node_resource`` when the ConfigMap content is
            unusable, the cluster check fails, or idle GPUs are insufficient.
        """
        configmap_reader = ConfigMapReader(job.namespace, "brain-manual-scale-config")
        json_data = configmap_reader.read_json_Data()

        if not isinstance(json_data, dict):
            logger.warning(f"ConfigMap data is not a dictionary, type: {type(json_data)}. Using default empty resource.")
            return self.current_node_resource

        customize_worker_resource: Optional[Dict[str, Any]] = json_data.get("customize_worker_resource", None)

        if not customize_worker_resource or not isinstance(customize_worker_resource, dict):
            logger.warning("ConfigMap 'customize_worker_resource' is missing or invalid. Using default empty resource.")
            return self.current_node_resource

        logger.info(f"ConfigMap customize_worker_resource data: {customize_worker_resource}")

        required_keys = ("cpu", "memory", "gpus", "gpu_type")
        missing_fields = [k for k in required_keys if customize_worker_resource.get(k) is None]

        if missing_fields:
            logger.error(f"Missing required fields in customize_worker_resource: {missing_fields}. Using default empty resource.")
            return self.current_node_resource

        # Parse BEFORE querying the cluster: the monitor needs the GPU type
        # and the capacity check needs the GPU count as an int.
        try:
            cpu, memory_gib, gpu_nums, gpu_type = self._parse_worker_resource(customize_worker_resource)
        except (ValueError, TypeError) as e:
            logger.error(f"Invalid data type in customize_worker_resource: {e}. Using default resource.")
            return self.current_node_resource

        if not gpu_type:
            logger.warning("GPU type is empty, defaulting to 'none' or handling as per business logic.")
            gpu_type = "none"

        try:
            cluster_idle_gpus = ResourceMonitor(job.namespace, gpu_type).get_cluster_idle_gpus()
        except Exception as e:
            # Best-effort boundary: any monitor failure keeps the current
            # resource instead of failing the optimization request.
            logger.warning(f"Cluster check failed: {e}. Fallback to current.")
            return self.current_node_resource

        if not self._has_enough_idle_gpus(cluster_idle_gpus, gpu_nums):
            logger.warning(
                f"No idle GPUs or idle GPUs not enough. Scale up aborted. "
                f"Need GPUs is {gpu_nums}, cluster idle GPUs is {cluster_idle_gpus}."
            )
            return self.current_node_resource

        # NodeResource receives memory in bytes.
        memory_bytes = memory_gib * UnitConvertor.GIB_TO_BYTES

        logger.info(
            f"Applying custom resources -> CPU: {cpu}, Memory: {memory_gib}GiB, GPUs: {gpu_nums}, Type: {gpu_type}"
        )
        return NodeResource(
            cpu=cpu,
            memory=memory_bytes,
            gpu=gpu_nums,
            gpu_type=gpu_type,
        )
Loading