Skip to content

Commit 35260c2

Browse files
committed
feat: update ray resources to match the newer/cleaner torchx resources form
This allows fixing an issue where `codeflare logs`, when late-attaching, may not stream out GPU utilization. BREAKING CHANGE: this changes the structure of the ray form; tests may need updates. Also, any automated -y runs will require an update.
1 parent 2c12e78 commit 35260c2

File tree

14 files changed

+75
-35
lines changed

14 files changed

+75
-35
lines changed

package-lock.json

Lines changed: 17 additions & 17 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

plugins/plugin-codeflare/package.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,13 @@
3030
"@types/split2": "^3.2.1"
3131
},
3232
"dependencies": {
33-
"@guidebooks/store": "^6.1.9",
33+
"@guidebooks/store": "^6.2.1",
3434
"@logdna/tail-file": "^3.0.1",
3535
"@patternfly/react-charts": "^6.94.18",
3636
"@patternfly/react-core": "^4.276.6",
3737
"asciinema-player": "^3.1.0",
3838
"chokidar": "^3.5.3",
39-
"madwizard": "^6.4.1",
39+
"madwizard": "^6.5.3",
4040
"needle": "^3.2.0",
4141
"open": "^8.4.2",
4242
"pretty-bytes": "^6.1.0",

tests/kind/profiles/non-gpu1/keep-it-simple

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
17-
"ml/ray/start/resources": "{\"Number of CPUs\":\"500m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
17+
"ml/ray/start/resources": "{\"Number of Workers\":1,\"CPUs per worker\":\"500m\",\"GPUs per worker\":0,\"Memory per worker\":\"1.5Gi\",\"Ephemeral Storage per worker\":\"5Gi\"}",
1818
"kubernetes/context": "kind-codeflare-test",
1919
"kubernetes/choose/ns": "default",
2020
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",

tests/kind/profiles/non-gpu1/mcad-coscheduler

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,13 @@
1414
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
17-
"ml/ray/start/resources": "{\"Number of CPUs\":\"200m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
17+
"ml/ray/start/resources": {
18+
"Number of Workers": 1,
19+
"CPUs per worker": "200m",
20+
"GPUs per worker": 0,
21+
"Memory per worker": "1.25Gi",
22+
"Ephemeral Storage per worker": "5Gi"
23+
},
1824
"kubernetes/context": "kind-codeflare-test",
1925
"kubernetes/choose/ns": "default",
2026
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",

tests/kind/profiles/non-gpu1/mcad-default

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
17-
"ml/ray/start/resources": "{\"Number of CPUs\":\"200m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
17+
"ml/ray/start/resources": "{\"Number of Workers\":1,\"CPUs per worker\":\"200m\",\"GPUs per worker\":0,\"Memory per worker\":\"1.25Gi\",\"Ephemeral Storage per worker\":\"5Gi\"}",
1818
"kubernetes/context": "kind-codeflare-test",
1919
"kubernetes/choose/ns": "default",
2020
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",

tests/kind/profiles/non-gpu1/mcad-preinstalled

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,13 @@
1414
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
17-
"ml/ray/start/resources": "{\"Number of CPUs\":\"200m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
17+
"ml/ray/start/resources": {
18+
"Number of Workers": 1,
19+
"CPUs per worker": "200m",
20+
"GPUs per worker": 0,
21+
"Memory per worker": "1.25Gi",
22+
"Ephemeral Storage per worker": "5Gi"
23+
},
1824
"kubernetes/context": "kind-codeflare-test",
1925
"kubernetes/choose/ns": "default",
2026
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",

tests/kind/profiles/non-gpu1/ray-autoscaler

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,13 @@
1414
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
17-
"ml/ray/start/resources": "{\"Number of CPUs\":\"200m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"0\",\"Maximum Workers\":\"0\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"2.5Gi\",\"Ephemeral Storage\":\"5Gi\"}",
17+
"ml/ray/start/resources": {
18+
"Number of Workers": 1,
19+
"CPUs per worker": "200m",
20+
"GPUs per worker": 0,
21+
"Memory per worker": "1.25Gi",
22+
"Ephemeral Storage per worker": "5Gi"
23+
},
1824
"kubernetes/context": "kind-codeflare-test",
1925
"kubernetes/choose/ns": "default",
2026
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",

tests/kind/profiles/non-gpu2/keep-it-simple

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,18 @@
1414
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/ray-basic\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
17-
"ml/ray/start/resources": "{\"Number of CPUs\":\"500m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
17+
"ml/ray/start/resources": {
18+
"Number of Workers": 1,
19+
"CPUs per worker": "500m",
20+
"GPUs per worker": 0,
21+
"Memory per worker": "1.5Gi",
22+
"Ephemeral Storage per worker": "5Gi"
23+
},
1824
"kubernetes/context": "kind-codeflare-test",
1925
"kubernetes/choose/ns": "default",
2026
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",
2127
"ml/ray/cluster/choose": "codeflare-test-ray-cluster",
2228
"ml/ray/cluster/choose/kubernetes": "codeflare-test-ray-cluster",
2329
"ml/ray/cluster/kubernetes/choose-pod-scheduler": "Keep It Simple"
2430
}
25-
}
31+
}

tests/kind/profiles/non-gpu3/keep-it-simple

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,13 @@
1414
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
17-
"ml/ray/start/resources": "{\"Number of CPUs\":\"500m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
17+
"ml/ray/start/resources": {
18+
"Number of Workers": 1,
19+
"CPUs per worker": "500m",
20+
"GPUs per worker": 0,
21+
"Memory per worker": "1.5Gi",
22+
"Ephemeral Storage per worker": "5Gi"
23+
},
1824
"kubernetes/context": "kind-codeflare-test",
1925
"kubernetes/choose/ns": "default",
2026
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",

tests/kind/profiles/non-gpu4/keep-it-simple

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,13 @@
1414
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
17-
"ml/ray/start/resources": "{\"Number of CPUs\":\"500m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
17+
"ml/ray/start/resources": {
18+
"Number of Workers": 1,
19+
"CPUs per worker": "500m",
20+
"GPUs per worker": 0,
21+
"Memory per worker": "1.5Gi",
22+
"Ephemeral Storage per worker": "5Gi"
23+
},
1824
"kubernetes/context": "kind-codeflare-test",
1925
"kubernetes/choose/ns": "default",
2026
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",

tests/kind/profiles/non-gpu5/keep-it-simple

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit-with-dashdash\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 intentionally-not-main.py\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
17-
"ml/ray/start/resources": "{\"Number of CPUs\":\"500m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
17+
"ml/ray/start/resources": "{\"Number of Workers\":1,\"CPUs per worker\":\"500m\",\"GPUs per worker\":0,\"Memory per worker\":\"1.5Gi\",\"Ephemeral Storage per worker\":\"5Gi\"}",
1818
"kubernetes/context": "kind-codeflare-test",
1919
"kubernetes/choose/ns": "default",
2020
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",

tests/kind/profiles/non-gpu6/keep-it-simple

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/torchx\",\"Base image\":\"ghcr.io/pytorch/torchx:0.5.0dev0\",\"Command line prefix\":\"python3 compute_world_size/main.py\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
17-
"ml/torchx/run/resources": "{\"Number of Workers\":\"1\",\"CPUs per worker\":\"500m\",\"GPUs per worker\":\"0\",\"Memory per worker\":\"500Mi\",\"Ephemeral Storage per worker\":\"5Gi\"}",
17+
"ml/ray/start/resources": "{\"Number of Workers\":1,\"CPUs per worker\":\"500m\",\"GPUs per worker\":0,\"Memory per worker\":\"1.5Gi\",\"Ephemeral Storage per worker\":\"5Gi\"}",
1818
"kubernetes/context": "kind-codeflare-test",
1919
"kubernetes/choose/ns": "default",
2020
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",

tests/kind/profiles/non-gpu6/mcad-default

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/torchx\",\"Base image\":\"bitnami/pytorch:1.13.1\",\"Command line prefix\":\"python3 compute_world_size/main.py\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
17-
"ml/torchx/run/resources": "{\"Number of Workers\":\"1\",\"CPUs per worker\":\"500m\",\"GPUs per worker\":\"0\",\"Memory per worker\":\"500Mi\",\"Ephemeral Storage per worker\":\"5Gi\"}",
17+
"ml/ray/start/resources": "{\"Number of Workers\":1,\"CPUs per worker\":\"500m\",\"GPUs per worker\":0,\"Memory per worker\":\"1.5Gi\",\"Ephemeral Storage per worker\":\"5Gi\"}",
1818
"kubernetes/context": "kind-codeflare-test",
1919
"kubernetes/choose/ns": "default",
2020
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",
@@ -24,4 +24,4 @@
2424
"kubernetes/mcad/choose/job-priority": "Default Priority",
2525
"kubernetes/mcad/choose/scheduler": "MCAD with the Default Kubernetes Scheduler"
2626
}
27-
}
27+
}

tests/kind/run.sh

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,10 @@ export RAY_KUBE_CLUSTER_NAME=codeflare-test-ray-cluster
2727
export NODE=node
2828
export CODEFLARE_HEADLESS_HOME=${CODEFLARE_HEADLESS_HOME-$ROOT/dist/headless}
2929

30-
while getopts "ab:f:is:" opt
30+
while getopts "Vab:f:is:" opt
3131
do
3232
case $opt in
33+
V) VERBOSE=true; continue;;
3334
a) FORCE_ALL=true; continue;;
3435
f) FORCE=$OPTARG; continue;;
3536
s) export GUIDEBOOK_STORE=$OPTARG; echo "[Test] Using store=$GUIDEBOOK_STORE"; continue;;
@@ -79,7 +80,10 @@ function run {
7980
fi
8081

8182
local guidebook=${2-$GUIDEBOOK}
82-
local yes=$([ -z "$FORCE_ALL" ] && [ "$FORCE" != "$profileFull" ] && [ -f "$MWPROFILES_PATH/$profile" ] && echo "--yes" || echo "")
83+
local yes=${YES-$([ -z "$FORCE_ALL" ] && [ "$FORCE" != "$profileFull" ] && [ -f "$MWPROFILES_PATH/$profile" ] && echo "--yes" || echo "")}
84+
if [[ -n "$VERBOSE" ]]; then
85+
local verbose="-V"
86+
fi
8387

8488
local dashdashFile="$MWPROFILES_PATH_BASE"/$variant/dashdash.txt
8589
if [ -f "$dashdashFile" ]; then
@@ -98,7 +102,7 @@ function run {
98102
fi
99103

100104
echo "[Test] Running with variant=$variant profile=$profile yes=$yes"
101-
GUIDEBOOK_NAME="main-job-run" "$ROOT"/bin/codeflare -p $profile $yes $guidebook -- $DASHDASH | tee $OUTPUT
105+
GUIDEBOOK_NAME="main-job-run" "$ROOT"/bin/codeflare -p $profile $verbose $yes $guidebook -- $DASHDASH | tee $OUTPUT
102106
}
103107

104108
#

0 commit comments

Comments
 (0)