Skip to content

Commit f3cd5b3

Browse files
committed
fix: port forward for local interactive tests
1 parent 028d0d5 commit f3cd5b3

File tree

3 files changed

+372
-36
lines changed

3 files changed

+372
-36
lines changed

.github/workflows/e2e_tests.yaml

Lines changed: 31 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -86,6 +86,36 @@ jobs:
8686
with:
8787
user-name: sdk-user
8888

89+
- name: Grant sdk-user port-forwarding permissions
90+
run: |
91+
cat <<EOF | kubectl apply -f -
92+
apiVersion: rbac.authorization.k8s.io/v1
93+
kind: ClusterRole
94+
metadata:
95+
name: port-forward-permissions
96+
rules:
97+
- apiGroups: [""]
98+
resources: ["services", "pods"]
99+
verbs: ["get", "list", "watch"]
100+
- apiGroups: [""]
101+
resources: ["pods/portforward"]
102+
verbs: ["create"]
103+
---
104+
apiVersion: rbac.authorization.k8s.io/v1
105+
kind: ClusterRoleBinding
106+
metadata:
107+
name: sdk-user-port-forward-binding
108+
subjects:
109+
- kind: User
110+
name: sdk-user
111+
apiGroup: rbac.authorization.k8s.io
112+
roleRef:
113+
kind: ClusterRole
114+
name: port-forward-permissions
115+
apiGroup: rbac.authorization.k8s.io
116+
EOF
117+
shell: bash
118+
89119
- name: Configure RBAC for sdk user with limited permissions
90120
run: |
91121
kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
@@ -117,7 +147,7 @@ jobs:
117147
pip install poetry
118148
poetry install --with test,docs
119149
echo "Running e2e tests..."
120-
poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
150+
poetry run pytest -v -s --log-cli-level=INFO ./tests/e2e/local_interactive_sdk_kind_test.py::TestRayLocalInteractiveOauth::test_local_interactives_nvidia_gpu > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
121151
env:
122152
GRPC_DNS_RESOLVER: "native"
123153

Lines changed: 126 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,14 @@
11
from codeflare_sdk import (
22
Cluster,
33
ClusterConfiguration,
4-
TokenAuthentication,
54
generate_cert,
65
)
76

87
import pytest
98
import ray
109
import math
10+
import time
11+
import subprocess
1112

1213
from support import *
1314

@@ -16,8 +17,17 @@
1617
class TestRayLocalInteractiveOauth:
1718
def setup_method(self):
1819
initialize_kubernetes_client(self)
20+
self.port_forward_process = None
1921

2022
def teardown_method(self):
23+
if self.port_forward_process:
24+
self.port_forward_process.terminate()
25+
try:
26+
self.port_forward_process.wait(timeout=10)
27+
except subprocess.TimeoutExpired:
28+
self.port_forward_process.kill()
29+
self.port_forward_process.wait()
30+
self.port_forward_process = None
2131
delete_namespace(self)
2232
delete_kueue_resources(self)
2333

@@ -39,6 +49,8 @@ def run_local_interactives(
3949
):
4050
cluster_name = "test-ray-cluster-li"
4151

52+
ray.shutdown()
53+
4254
cluster = Cluster(
4355
ClusterConfiguration(
4456
name=cluster_name,
@@ -49,45 +61,124 @@ def run_local_interactives(
4961
head_memory_requests=2,
5062
head_memory_limits=2,
5163
worker_cpu_requests="500m",
52-
worker_cpu_limits=1,
64+
worker_cpu_limits="500m",
5365
worker_memory_requests=1,
5466
worker_memory_limits=4,
5567
worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
56-
write_to_file=True,
5768
verify_tls=False,
5869
)
5970
)
60-
cluster.up()
61-
cluster.wait_ready()
62-
63-
generate_cert.generate_tls_cert(cluster_name, self.namespace)
64-
generate_cert.export_env(cluster_name, self.namespace)
65-
66-
print(cluster.local_client_url())
6771

68-
ray.shutdown()
69-
ray.init(address=cluster.local_client_url(), logging_level="DEBUG")
70-
71-
@ray.remote(num_gpus=number_of_gpus / 2)
72-
def heavy_calculation_part(num_iterations):
73-
result = 0.0
74-
for i in range(num_iterations):
75-
for j in range(num_iterations):
76-
for k in range(num_iterations):
77-
result += math.sin(i) * math.cos(j) * math.tan(k)
78-
return result
79-
80-
@ray.remote(num_gpus=number_of_gpus / 2)
81-
def heavy_calculation(num_iterations):
82-
results = ray.get(
83-
[heavy_calculation_part.remote(num_iterations // 30) for _ in range(30)]
72+
try:
73+
cluster.up()
74+
75+
cluster.wait_ready()
76+
cluster.status()
77+
78+
TIMEOUT = 300 # 5 minutes
79+
END = time.time() + TIMEOUT
80+
81+
head_pod_name = None
82+
worker_pod_name = None
83+
84+
while time.time() < END:
85+
if not head_pod_name:
86+
head_pod_name = kubectl_get_pod_name_by_substring(
87+
self.namespace, cluster_name, "head"
88+
)
89+
if not worker_pod_name:
90+
worker_pod_name = kubectl_get_pod_name_by_substring(
91+
self.namespace, cluster_name, "worker"
92+
)
93+
94+
head_status = (
95+
kubectl_get_pod_status(self.namespace, head_pod_name)
96+
if head_pod_name
97+
else "NotFound"
98+
)
99+
worker_status = (
100+
kubectl_get_pod_status(self.namespace, worker_pod_name)
101+
if worker_pod_name
102+
else "NotFound"
103+
)
104+
105+
if (
106+
head_pod_name
107+
and worker_pod_name
108+
and "Running" in head_status
109+
and "Running" in worker_status
110+
):
111+
head_ready = kubectl_get_pod_ready(self.namespace, head_pod_name)
112+
worker_ready = kubectl_get_pod_ready(
113+
self.namespace, worker_pod_name
114+
)
115+
if head_ready and worker_ready:
116+
break
117+
time.sleep(10)
118+
119+
generate_cert.generate_tls_cert(cluster_name, self.namespace)
120+
generate_cert.export_env(cluster_name, self.namespace)
121+
122+
local_port = "20001"
123+
ray_client_port = "10001"
124+
head_service_name = f"{cluster_name}-head-svc"
125+
126+
port_forward_cmd = [
127+
"kubectl",
128+
"port-forward",
129+
"-n",
130+
self.namespace,
131+
f"svc/{head_service_name}",
132+
f"{local_port}:{ray_client_port}",
133+
]
134+
self.port_forward_process = subprocess.Popen(
135+
port_forward_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
84136
)
85-
return sum(results)
86-
87-
ref = heavy_calculation.remote(3000)
88-
result = ray.get(ref)
89-
assert result == 1789.4644387076714
90-
ray.cancel(ref)
91-
ray.shutdown()
92-
93-
cluster.down()
137+
time.sleep(5)
138+
139+
client_url = f"ray://localhost:{local_port}"
140+
cluster.status()
141+
142+
ray.init(address=client_url, logging_level="INFO")
143+
144+
@ray.remote(num_gpus=number_of_gpus / 2)
145+
def heavy_calculation_part(num_iterations):
146+
result = 0.0
147+
for i in range(num_iterations):
148+
for j in range(num_iterations):
149+
for k in range(num_iterations):
150+
result += math.sin(i) * math.cos(j) * math.tan(k)
151+
return result
152+
153+
@ray.remote(num_gpus=number_of_gpus / 2)
154+
def heavy_calculation(num_iterations):
155+
results = ray.get(
156+
[
157+
heavy_calculation_part.remote(num_iterations // 30)
158+
for _ in range(30)
159+
]
160+
)
161+
return sum(results)
162+
163+
ref = heavy_calculation.remote(3000)
164+
165+
try:
166+
result = ray.get(ref)
167+
assert result == 1789.4644387076714
168+
except Exception as e:
169+
raise
170+
finally:
171+
ray.cancel(ref)
172+
173+
ray.shutdown()
174+
175+
finally:
176+
if self.port_forward_process:
177+
self.port_forward_process.terminate()
178+
try:
179+
self.port_forward_process.wait(timeout=10)
180+
except subprocess.TimeoutExpired:
181+
self.port_forward_process.kill()
182+
self.port_forward_process.wait()
183+
self.port_forward_process = None
184+
cluster.down()

0 commit comments

Comments
 (0)