Skip to content

Commit ccc9129

Browse files
committed
Restore Rust installation step from main
1 parent 333dcc1 commit ccc9129

File tree

16 files changed, with 102 additions and 149 deletions.

.github/workflows/gcm_python.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,11 @@ jobs:
6464
path: ~/.cache/venv-ci
6565
key: ${{ env.pythonLocation }}-${{ hashFiles('dev-requirements.txt') }}
6666

67+
- name: Install Rust
68+
run: |
69+
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
70+
cp -r $HOME/.cargo/bin/* $HOME/.cache/venv-ci/bin/
71+
6772
- name: Install build dependencies
6873
run: |
6974
sudo apt update --yes
Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,26 @@
11
# Copyright (c) Meta Platforms, Inc. and affiliates.
22
# All rights reserved.
3-
from gcm.monitoring.accelerator.backend import (
3+
from gcm.accelerator.backend import (
44
AcceleratorBackend,
55
BackendName,
66
DeviceHandle,
77
ProbeResult,
88
)
9-
from gcm.monitoring.accelerator.errors import (
9+
from gcm.accelerator.errors import (
1010
AcceleratorError,
1111
BackendUnavailableError,
1212
UnsupportedOperationError,
1313
)
14-
from gcm.monitoring.accelerator.manager import AcceleratorManager
15-
from gcm.monitoring.accelerator.metrics import (
16-
Capability,
17-
CapabilitySet,
18-
MetricRequest,
19-
MetricSet,
20-
)
21-
from gcm.monitoring.accelerator.registry import default_backend_factories
14+
from gcm.accelerator.manager import AcceleratorManager
15+
from gcm.accelerator.metrics import MetricRequest, MetricSet
16+
from gcm.accelerator.registry import default_backend_factories
2217

2318
__all__ = [
2419
"AcceleratorBackend",
2520
"AcceleratorError",
2621
"AcceleratorManager",
2722
"BackendName",
2823
"BackendUnavailableError",
29-
"Capability",
30-
"CapabilitySet",
3124
"DeviceHandle",
3225
"MetricRequest",
3326
"MetricSet",
Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from enum import Enum
66
from typing import Callable, List, Protocol
77

8-
from gcm.monitoring.accelerator.metrics import CapabilitySet, MetricRequest, MetricSet
8+
from gcm.accelerator.metrics import MetricRequest, MetricSet
99

1010

1111
class BackendName(str, Enum):
@@ -39,8 +39,6 @@ def probe(self) -> ProbeResult: ...
3939

4040
def enumerate_devices(self) -> List[DeviceHandle]: ...
4141

42-
def capabilities(self, device: DeviceHandle) -> CapabilitySet: ...
43-
4442
def read_metrics(
4543
self, device: DeviceHandle, request: MetricRequest
4644
) -> MetricSet: ...
File renamed without changes.

gcm/monitoring/accelerator/backends/nvml.py renamed to gcm/accelerator/backends/nvml.py

Lines changed: 25 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -2,25 +2,19 @@
22
# All rights reserved.
33
from dataclasses import dataclass, field
44
from datetime import datetime, timezone
5-
from typing import Callable, Optional, TypeVar
5+
from typing import Any, Callable, Optional, TypeVar
66

7-
from gcm.monitoring.accelerator.backend import BackendName, DeviceHandle, ProbeResult
8-
from gcm.monitoring.accelerator.errors import (
9-
BackendUnavailableError,
10-
UnsupportedOperationError,
11-
)
12-
from gcm.monitoring.accelerator.metrics import (
13-
Capability,
14-
CapabilitySet,
15-
MetricRequest,
16-
MetricSet,
17-
)
18-
from gcm.monitoring.accelerator.probe import find_and_load_library
7+
from gcm.accelerator.backend import BackendName, DeviceHandle, ProbeResult
8+
from gcm.accelerator.errors import BackendUnavailableError, UnsupportedOperationError
9+
from gcm.accelerator.metrics import MetricRequest, MetricSet
10+
from gcm.accelerator.probe import find_and_load_library
1911
from gcm.monitoring.device_telemetry_client import (
2012
DeviceTelemetryClient,
2113
DeviceTelemetryException,
2214
)
15+
from gcm.monitoring.utils.error import safe_call
2316
from gcm.schemas.gpu.application_clock import ApplicationClockInfo
17+
2418
from gcm.schemas.gpu.memory import GPUMemory
2519
from gcm.schemas.gpu.utilization import GPUUtilization
2620

@@ -50,6 +44,7 @@ class NVMLBackend:
5044
_client: Optional[DeviceTelemetryClient] = field(
5145
default=None, init=False, repr=False
5246
)
47+
_handles: dict[str, Any] = field(default_factory=dict, init=False, repr=False)
5348

5449
def name(self) -> BackendName:
5550
return BackendName.NVML
@@ -83,7 +78,15 @@ def enumerate_devices(self) -> list[DeviceHandle]:
8378
devices: list[DeviceHandle] = []
8479
for index in range(device_count):
8580
model: Optional[str] = None
86-
handle = client.get_device_by_index(index)
81+
82+
# Check cache first or fetch handle
83+
dev_id = str(index)
84+
if dev_id in self._handles:
85+
handle = self._handles[dev_id]
86+
else:
87+
handle = client.get_device_by_index(index)
88+
self._handles[dev_id] = handle
89+
8790
model_getter = getattr(handle, "get_name", None)
8891
if callable(model_getter):
8992
maybe_model = self._safe_call(model_getter)
@@ -92,7 +95,7 @@ def enumerate_devices(self) -> list[DeviceHandle]:
9295
devices.append(
9396
DeviceHandle(
9497
backend=self.name(),
95-
id=str(index),
98+
id=dev_id,
9699
vendor="nvidia",
97100
model=model,
98101
)
@@ -101,33 +104,21 @@ def enumerate_devices(self) -> list[DeviceHandle]:
101104
except DeviceTelemetryException as e:
102105
raise UnsupportedOperationError("NVML enumerate_devices failed") from e
103106

104-
def capabilities(self, _device: DeviceHandle) -> CapabilitySet:
105-
return CapabilitySet(
106-
values={
107-
Capability.UTILIZATION,
108-
Capability.MEMORY,
109-
Capability.POWER,
110-
Capability.THERMALS,
111-
Capability.CLOCKS,
112-
Capability.ECC,
113-
Capability.PROCESSES,
114-
}
115-
)
116-
117107
@staticmethod
118108
def _safe_call(func: Callable[[], _T]) -> _T | None:
119-
try:
120-
return func()
121-
except DeviceTelemetryException:
122-
return None
109+
return safe_call(func, DeviceTelemetryException, logger_name=__name__)
123110

124111
def read_metrics(self, device: DeviceHandle, _request: MetricRequest) -> MetricSet:
125112
# TODO: Wire MetricRequest.include_process_info once process telemetry
126113
# is available through HAL MetricSet.
127114
client = self._ensure_client()
128115
try:
129-
index = int(device.id)
130-
handle = client.get_device_by_index(index)
116+
if device.id in self._handles:
117+
handle = self._handles[device.id]
118+
else:
119+
index = int(device.id)
120+
handle = client.get_device_by_index(index)
121+
self._handles[device.id] = handle
131122
except (ValueError, DeviceTelemetryException) as e:
132123
raise UnsupportedOperationError(
133124
f"invalid NVML device id: {device.id}"
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# All rights reserved.
33
from dataclasses import dataclass
44

5-
from gcm.monitoring.accelerator.backend import BackendName
5+
from gcm.accelerator.backend import BackendName
66

77

88
class AcceleratorError(Exception):
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
# Copyright (c) Meta Platforms, Inc. and affiliates.
22
# All rights reserved.
3-
from gcm.monitoring.accelerator.backend import (
3+
from gcm.accelerator.backend import (
44
AcceleratorBackend,
55
BackendFactory,
66
BackendName,
77
DeviceHandle,
88
ProbeResult,
99
)
10-
from gcm.monitoring.accelerator.errors import BackendOperationError
11-
from gcm.monitoring.accelerator.metrics import MetricRequest, MetricSet
10+
from gcm.accelerator.errors import BackendOperationError
11+
from gcm.accelerator.metrics import MetricRequest, MetricSet
1212

1313

1414
class AcceleratorManager:
Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,26 +2,6 @@
22
# All rights reserved.
33
from dataclasses import dataclass, field
44
from datetime import datetime, timezone
5-
from enum import Enum
6-
7-
8-
class Capability(str, Enum):
9-
UTILIZATION = "utilization"
10-
MEMORY = "memory"
11-
POWER = "power"
12-
THERMALS = "thermals"
13-
CLOCKS = "clocks"
14-
ECC = "ecc"
15-
TOPOLOGY = "topology"
16-
PROCESSES = "processes"
17-
18-
19-
@dataclass(frozen=True)
20-
class CapabilitySet:
21-
values: set[Capability]
22-
23-
def supports(self, capability: Capability) -> bool:
24-
return capability in self.values
255

266

277
@dataclass(frozen=True)

gcm/accelerator/registry.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
from gcm.accelerator.backend import BackendFactory, BackendName
4+
from gcm.accelerator.backends.nvml import NVMLBackend
5+
6+
7+
def default_backend_factories() -> dict[BackendName, BackendFactory]:
8+
return {
9+
BackendName.NVML: lambda: NVMLBackend(),
10+
}

0 commit comments

Comments (0)