Skip to content

Commit 1acefc3

Browse files
committed
feat: update to support new ray-job-definition.json info
rather than depending on the job.json that was generated only when the ray job was submitted by us. WARNING: tests/plugin-codeflare/dashboard/inputs/2 still needs to be updated to include the new job definition file assets.
1 parent f0f1651 commit 1acefc3

File tree

15 files changed

+2869
-1759
lines changed

15 files changed

+2869
-1759
lines changed

plugins/plugin-client-default/notebooks/dashboard.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ layout:
2424
execute: now
2525
outputOnly: true
2626
---
27-
description application "$LOGDIR/job.json"
27+
description application "$LOGDIR"
2828
```
2929

3030
=== "Workers"
@@ -34,7 +34,7 @@ layout:
3434
execute: now
3535
outputOnly: true
3636
---
37-
description workers "$LOGDIR/job.json"
37+
description workers "$LOGDIR"
3838
```
3939

4040
---

plugins/plugin-codeflare/src/controller/description.ts

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
* limitations under the License.
1515
*/
1616

17+
import { join } from "path"
1718
import { Arguments, Registrar } from "@kui-shell/core"
1819
import { expand } from "../lib/util"
1920

@@ -42,20 +43,27 @@ export type SummaryResponse = {
4243
source: string
4344
}
4445

46+
/** Given the location of the staging directory, return the location of the ray job definition */
47+
function jobDefinition(filepath: string) {
48+
return expand(join(filepath.replace(/'/g, ""), "ray-job-definition.json"))
49+
}
50+
4551
async function app(args: Arguments) {
4652
const filepath = args.argvNoOptions[2]
4753
if (!filepath) {
4854
throw new Error("Usage: description application <filepath>")
4955
}
5056

51-
const jobInfo = JSON.parse(await args.REPL.qexec<string>(`vfs fslice ${expand(filepath)} 0`))
52-
const { RAY_IMAGE } = jobInfo.runtimeEnv.env_vars
57+
const jobInfo = JSON.parse(await args.REPL.qexec<string>(`vfs fslice ${jobDefinition(filepath)} 0`))
58+
const { RAY_IMAGE } = jobInfo.runtime_env.env_vars
59+
60+
const status = jobInfo.status.toLowerCase()
5361

5462
const summaryData = [
5563
{ label: "Application Class", value: "Unknown" }, // TODO...
5664
{ label: "Application Name", value: "Unknown" }, // TODO...
5765
{ label: "Base Image", value: RAY_IMAGE },
58-
{ label: "Run Status", value: process.env.FOLLOW ? "Running" : "Done" }, // TODO...
66+
{ label: "Run Status", value: status ? status[0].toUpperCase() + status.slice(1) : "Unknown" },
5967
]
6068

6169
const React = await import("react")
@@ -72,8 +80,8 @@ async function workers(args: Arguments) {
7280
throw new Error("Usage: description workers <filepath>")
7381
}
7482

75-
const jobInfo = JSON.parse(await args.REPL.qexec<string>(`vfs fslice ${expand(filepath)} 0`))
76-
const { KUBE_CONTEXT, KUBE_NS, WORKER_MEMORY, MIN_WORKERS, MAX_WORKERS } = jobInfo.runtimeEnv.env_vars
83+
const jobInfo = JSON.parse(await args.REPL.qexec<string>(`vfs fslice ${jobDefinition(filepath)} 0`))
84+
const { KUBE_CONTEXT, KUBE_NS, WORKER_MEMORY, MIN_WORKERS, MAX_WORKERS } = jobInfo.runtime_env.env_vars
7785

7886
const summaryData = [
7987
{ label: "Cluster Context", value: KUBE_CONTEXT.replace(/^[^/]+\//, "") },

tests/plugin-codeflare/dashboard/inputs/1/README.md

Lines changed: 0 additions & 1 deletion
This file was deleted.

tests/plugin-codeflare/dashboard/inputs/1/choices.json

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"madwizard/apriori/arch": "x64",
44
"madwizard/apriori/platform": "darwin",
55
"madwizard/apriori/mac-installer": "Homebrew",
6-
"madwizard/apriori/in-terminal": "Text",
6+
"madwizard/apriori/in-terminal": "HTML",
77
"Training####Fine Tuning": "Fine Tuning",
88
"GLUE": "GLUE",
99
"AWS####IBM": "AWS",
@@ -21,5 +21,12 @@
2121
"Number of CPUs####Number of GPUs": "{\"Number of CPUs\":4,\"Number of GPUs\":3}",
2222
"expand(echo ${A-error} ; echo ${B-4} ; echo ${C-5})": "3",
2323
"XXXXXX.11111####222222": "11111",
24-
"YYYYYY.11111####222222": "222222"
24+
"YYYYYY.11111####222222": "222222",
25+
"My Cluster is Running Locally####My Cluster is Runing on Kubernetes": "My Cluster is Runing on Kubernetes",
26+
"expand([ -n \"$RAY_ADDRESS\" ] && ray job list --address $RAY_ADDRESS | tail +2 | awk '{print $1}' | sed \"s/[:{' ]//g\", Ray Runs)": "07a2647f-3656-4e3e-836c-95a2fa841af6",
27+
"expand([ -n \"$RAY_ADDRESS\" ] && curl $RAY_ADDRESS/api/jobs/ | jq -r 'keys | .[]', Ray Runs)": "d5a0d68f-a675-49ca-bff7-4ae762e6b146",
28+
"My Cluster is Running Locally####My Cluster is Running on Kubernetes": "My Cluster is Running on Kubernetes",
29+
"expand([ -n \"$RAY_ADDRESS\" ] && curl $RAY_ADDRESS/api/jobs/ | jq -r 'to_entries | sort_by(.value.start_time) | reverse | .[] | \"\\(.key) \\(.value.status) \\(.value.entrypoint)\"' | sed -E 's/python3 ([^[:space:]])+ //g' | awk '{a=$1;b=$2; $1=\"\";$2=\"\";print \"\\033;1m\" a, \"\\033[0;33m\" b \"\\033[0;2m\" $0 \"\\033[0m\"}', Ray Runs)": "\u001b;1ma88d4632-ab5c-4350-a770-d39a955c42c8 \u001b[0;33mRUNNING\u001b[0;2m -v --datapath /tmp/ --modelpath /tmp/ --logpath /tmp/ --tblogpath s3://browsey/codeflare/a88d4632-ab5c-4350-a770-d39a955c42c8/tensorboard/ --num_workers 1\u001b[0m",
30+
"expand([ -n \"$RAY_ADDRESS\" ] && curl $RAY_ADDRESS/api/jobs/ | jq -r 'to_entries | sort_by(.value.start_time) | reverse | .[] | \"\\(.key) \\(.value.status) \\(.value.entrypoint)\"' | sed -E 's/python3 ([^[:space:]])+ //g' | awk '{a=$1;b=$2; $1=\"\";$2=\"\";print a, \"\\033[33m\" b \"\\033[0;2m\" $0 \"\\033[0m\"}', Ray Runs)": "a88d4632-ab5c-4350-a770-d39a955c42c8 \u001b[33mRUNNING\u001b[0;2m -v --datapath /tmp/ --modelpath /tmp/ --logpath /tmp/ --tblogpath s3://browsey/codeflare/a88d4632-ab5c-4350-a770-d39a955c42c8/tensorboard/ --num_workers 1\u001b[0m",
31+
"expand([ -n \"$RAY_ADDRESS\" ] && curl $RAY_ADDRESS/api/jobs/ | jq -r 'to_entries | sort_by(.value.start_time) | reverse | .[] | \"\\(.key) \\(.value.status) \\(.value.start_time / 1000 | strflocaltime(\"%Y-%m-%dT%H:%M:%S\")) \\(.value.entrypoint)\"' | sed -E 's/python3 ([^[:space:]])+ //g' | awk '{a=$1;b=$2;c=$3; $1=\"\";$2=\"\";$3=\"\"; print a, \"\\033[0;36m\" c, \"\\033[0;1;33m\" b \"\\033[0;2m\" $0 \"\\033[0m\"}', Ray Runs)": "505b98b6-a258-4afd-bdc6-ddf84d3f2862 \u001b[0;36m2022-07-07T13:27:54 \u001b[0;1;33mRUNNING\u001b[0;2m -v --datapath /tmp/ --modelpath /tmp/ --logpath /tmp/ --tblogpath s3://browsey/codeflare/505b98b6-a258-4afd-bdc6-ddf84d3f2862/tensorboard/ --num_workers 1\u001b[0m"
2532
}
Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +0,0 @@
1-
LAST SEEN TYPE REASON OBJECT MESSAGE
2-
0s Normal Scheduled pod/mycluster-ray-head-type-krlr4 Successfully assigned nvidia-gpu-operator/mycluster-ray-head-type-krlr4 to ip-10-0-128-169.ec2.internal
3-
0s Normal AddedInterface pod/mycluster-ray-head-type-krlr4 Add eth0 [10.128.44.144/23] from openshift-sdn
4-
0s Normal Pulling pod/mycluster-ray-head-type-krlr4 Pulling image "rayproject/ray-ml:1.13.0-py37-gpu"
5-
0s Normal Pulled pod/mycluster-ray-head-type-krlr4 Successfully pulled image "rayproject/ray-ml:1.13.0-py37-gpu" in 6m48.700535275s
6-
0s Normal Created pod/mycluster-ray-head-type-krlr4 Created container ray-node
7-
0s Normal Started pod/mycluster-ray-head-type-krlr4 Started container ray-node
8-
0s Normal Scheduled pod/mycluster-ray-worker-type-6r7hp Successfully assigned nvidia-gpu-operator/mycluster-ray-worker-type-6r7hp to ip-10-0-133-106.ec2.internal
9-
0s Normal AddedInterface pod/mycluster-ray-worker-type-6r7hp Add eth0 [10.131.42.42/23] from openshift-sdn
10-
0s Normal Pulling pod/mycluster-ray-worker-type-6r7hp Pulling image "rayproject/ray-ml:1.13.0-py37-gpu"
11-
0s Normal Pulled pod/mycluster-ray-worker-type-6r7hp Successfully pulled image "rayproject/ray-ml:1.13.0-py37-gpu" in 6m14.380152399s
12-
0s Normal Created pod/mycluster-ray-worker-type-6r7hp Created container ray-node
13-
0s Normal Started pod/mycluster-ray-worker-type-6r7hp Started container ray-node

tests/plugin-codeflare/dashboard/inputs/1/job.json

Lines changed: 21 additions & 19 deletions
Large diffs are not rendered by default.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
87ed37bc-a837-4f20-aad0-0a12754452f4
1+
bf3d5456-fba9-42b1-9b55-119366b409ee

0 commit comments

Comments
 (0)