From 1ddc998774af2a728eafb57071a6c5c7181a1ca4 Mon Sep 17 00:00:00 2001
From: Jon Huhn
Date: Tue, 14 Jan 2025 12:42:50 -0600
Subject: [PATCH 1/2] Verify environment variables in e2e test

---
 test/e2e/e2e.sh | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/test/e2e/e2e.sh b/test/e2e/e2e.sh
index cf7a9a7..2cf9803 100755
--- a/test/e2e/e2e.sh
+++ b/test/e2e/e2e.sh
@@ -27,6 +27,17 @@ kubectl create -f demo/gpu-test3.yaml
 kubectl create -f demo/gpu-test4.yaml
 kubectl create -f demo/gpu-test5.yaml
 
+function verify-env {
+  local namespace="$1"
+  local pod="$2"
+  for ctr in $(kubectl get pod -n "$namespace" "$pod" -o jsonpath='{.spec.containers[*].name}'); do
+    if ! kubectl logs -n "$namespace" "$pod" -c "$ctr" | grep -q "GPU_DEVICE_"; then
+      echo "Pod $namespace/$pod, container $ctr missing GPU_DEVICE_ environment variables"
+      exit 1
+    fi
+  done
+}
+
 kubectl wait --for=condition=Ready -n gpu-test1 pod/pod0 --timeout=120s
 kubectl wait --for=condition=Ready -n gpu-test1 pod/pod1 --timeout=120s
 gpu_test_1=$(kubectl get pods -n gpu-test1 | grep -c 'Running')
@@ -34,6 +45,8 @@ if [ $gpu_test_1 != 2 ]; then
     echo "gpu_test_1 $gpu_test_1 failed to match against 2 expected pods"
     exit 1
 fi
+verify-env gpu-test1 pod0
+verify-env gpu-test1 pod1
 
 kubectl wait --for=condition=Ready -n gpu-test2 pod/pod0 --timeout=120s
@@ -42,6 +55,7 @@ if [ $gpu_test_2 != 1 ]; then
     echo "gpu_test_2 $gpu_test_2 failed to match against 1 expected pod"
     exit 1
 fi
+verify-env gpu-test2 pod0
 
 kubectl wait --for=condition=Ready -n gpu-test3 pod/pod0 --timeout=120s
 gpu_test_3=$(kubectl get pods -n gpu-test3 | grep -c 'Running')
@@ -49,14 +63,17 @@ if [ $gpu_test_3 != 1 ]; then
     echo "gpu_test_3 $gpu_test_3 failed to match against 1 expected pod"
     exit 1
 fi
+verify-env gpu-test3 pod0
 
 kubectl wait --for=condition=Ready -n gpu-test4 pod/pod0 --timeout=120s
 kubectl wait --for=condition=Ready -n gpu-test4 pod/pod1 --timeout=120s
 gpu_test_4=$(kubectl get pods -n gpu-test4 | grep -c 'Running')
 if [ $gpu_test_4 != 2 ]; then
-    echo "gpu_test_4 $gpu_test_4 failed to match against 1 expected pods"
+    echo "gpu_test_4 $gpu_test_4 failed to match against 2 expected pods"
     exit 1
 fi
+verify-env gpu-test4 pod0
+verify-env gpu-test4 pod1
 
 kubectl wait --for=condition=Ready -n gpu-test5 pod/pod0 --timeout=120s
 gpu_test_5=$(kubectl get pods -n gpu-test5 | grep -c 'Running')
@@ -64,6 +81,7 @@ if [ $gpu_test_5 != 1 ]; then
    echo "gpu_test_5 $gpu_test_5 failed to match against 1 expected pod"
    exit 1
 fi
+verify-env gpu-test5 pod0
 
 # test that deletion is fast (less than the default grace period of 30s)
 # see https://github.com/kubernetes/kubernetes/issues/127188 for details

From a4795f1e51aca3fef219c54494529c08573833da Mon Sep 17 00:00:00 2001
From: Jon Huhn
Date: Wed, 15 Jan 2025 16:04:25 -0600
Subject: [PATCH 2/2] fixup! Verify environment variables in e2e test

---
 test/e2e/e2e.sh | 294 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 279 insertions(+), 15 deletions(-)

diff --git a/test/e2e/e2e.sh b/test/e2e/e2e.sh
index 2cf9803..95d3102 100755
--- a/test/e2e/e2e.sh
+++ b/test/e2e/e2e.sh
@@ -27,15 +27,41 @@ kubectl create -f demo/gpu-test3.yaml
 kubectl create -f demo/gpu-test4.yaml
 kubectl create -f demo/gpu-test5.yaml
 
-function verify-env {
-  local namespace="$1"
-  local pod="$2"
-  for ctr in $(kubectl get pod -n "$namespace" "$pod" -o jsonpath='{.spec.containers[*].name}'); do
-    if ! kubectl logs -n "$namespace" "$pod" -c "$ctr" | grep -q "GPU_DEVICE_"; then
-      echo "Pod $namespace/$pod, container $ctr missing GPU_DEVICE_ environment variables"
-      exit 1
-    fi
+function gpus-from-logs {
+  local logs="$1"
+  echo "$logs" | sed -nE "s/^declare -x GPU_DEVICE_[[:digit:]]+=\"(.+)\"$/\1/p"
+}
+
+function gpu-id {
+  local gpu="$1"
+  echo "$gpu" | sed -nE "s/^gpu-([[:digit:]]+)$/\1/p"
+}
+
+function gpu-sharing-strategy-from-logs {
+  local logs="$1"
+  local id="$2"
+  echo "$logs" | sed -nE "s/^declare -x GPU_DEVICE_${id}_SHARING_STRATEGY=\"(.+)\"$/\1/p"
+}
+
+function gpu-timeslice-interval-from-logs {
+  local logs="$1"
+  local id="$2"
+  echo "$logs" | sed -nE "s/^declare -x GPU_DEVICE_${id}_TIMESLICE_INTERVAL=\"(.+)\"$/\1/p"
+}
+
+function gpu-partition-count-from-logs {
+  local logs="$1"
+  local id="$2"
+  echo "$logs" | sed -nE "s/^declare -x GPU_DEVICE_${id}_PARTITION_COUNT=\"(.+)\"$/\1/p"
+}
+
+declare -a observed_gpus
+function gpu-already-seen {
+  local gpu="$1"
+  for seen in "${observed_gpus[@]}"; do
+    if [[ "$gpu" == "$seen" ]]; then return 0; fi;
   done
+  return 1
 }
 
 kubectl wait --for=condition=Ready -n gpu-test1 pod/pod0 --timeout=120s
 kubectl wait --for=condition=Ready -n gpu-test1 pod/pod1 --timeout=120s
 gpu_test_1=$(kubectl get pods -n gpu-test1 | grep -c 'Running')
@@ -45,8 +71,36 @@ if [ $gpu_test_1 != 2 ]; then
     echo "gpu_test_1 $gpu_test_1 failed to match against 2 expected pods"
     exit 1
 fi
-verify-env gpu-test1 pod0
-verify-env gpu-test1 pod1
+
+gpu_test1_pod0_ctr0_logs=$(kubectl logs -n gpu-test1 pod0 -c ctr0)
+gpu_test1_pod0_ctr0_gpus=$(gpus-from-logs "$gpu_test1_pod0_ctr0_logs")
+gpu_test1_pod0_ctr0_gpus_count=$(echo "$gpu_test1_pod0_ctr0_gpus" | wc -w)
+if [[ $gpu_test1_pod0_ctr0_gpus_count != 1 ]]; then
+  echo "Expected Pod gpu-test1/pod0, container ctr0 to have 1 GPU, but got $gpu_test1_pod0_ctr0_gpus_count: $gpu_test1_pod0_ctr0_gpus"
+  exit 1
+fi
+gpu_test1_pod0_ctr0_gpu="$gpu_test1_pod0_ctr0_gpus"
+if gpu-already-seen "$gpu_test1_pod0_ctr0_gpu"; then
+  echo "Pod gpu-test1/pod0, container ctr0 should have a new GPU but claimed $gpu_test1_pod0_ctr0_gpu which is already claimed"
+  exit 1
+fi
+echo "Pod gpu-test1/pod0, container ctr0 claimed $gpu_test1_pod0_ctr0_gpu"
+observed_gpus+=("$gpu_test1_pod0_ctr0_gpu")
+
+gpu_test1_pod1_ctr0_logs=$(kubectl logs -n gpu-test1 pod1 -c ctr0)
+gpu_test1_pod1_ctr0_gpus=$(gpus-from-logs "$gpu_test1_pod1_ctr0_logs")
+gpu_test1_pod1_ctr0_gpus_count=$(echo "$gpu_test1_pod1_ctr0_gpus" | wc -w)
+if [[ $gpu_test1_pod1_ctr0_gpus_count != 1 ]]; then
+  echo "Expected Pod gpu-test1/pod1, container ctr0 to have 1 GPU, but got $gpu_test1_pod1_ctr0_gpus_count: $gpu_test1_pod1_ctr0_gpus"
+  exit 1
+fi
+gpu_test1_pod1_ctr0_gpu="$gpu_test1_pod1_ctr0_gpus"
+if gpu-already-seen "$gpu_test1_pod1_ctr0_gpu"; then
+  echo "Pod gpu-test1/pod1, container ctr0 should have a new GPU but claimed $gpu_test1_pod1_ctr0_gpu which is already claimed"
+  exit 1
+fi
+echo "Pod gpu-test1/pod1, container ctr0 claimed $gpu_test1_pod1_ctr0_gpu"
+observed_gpus+=("$gpu_test1_pod1_ctr0_gpu")
 
 kubectl wait --for=condition=Ready -n gpu-test2 pod/pod0 --timeout=120s
 gpu_test_2=$(kubectl get pods -n gpu-test2 | grep -c 'Running')
@@ -55,7 +109,23 @@ if [ $gpu_test_2 != 1 ]; then
     echo "gpu_test_2 $gpu_test_2 failed to match against 1 expected pod"
     exit 1
 fi
-verify-env gpu-test2 pod0
+
+gpu_test2_pod0_ctr0_logs=$(kubectl logs -n gpu-test2 pod0 -c ctr0)
+gpu_test2_pod0_ctr0_gpus=$(gpus-from-logs "$gpu_test2_pod0_ctr0_logs")
+gpu_test2_pod0_ctr0_gpus_count=$(echo "$gpu_test2_pod0_ctr0_gpus" | wc -w)
+if [[ $gpu_test2_pod0_ctr0_gpus_count != 2 ]]; then
+  echo "Expected Pod gpu-test2/pod0, container ctr0 to have 2 GPUs, but got $gpu_test2_pod0_ctr0_gpus_count: $gpu_test2_pod0_ctr0_gpus"
+  exit 1
+fi
+echo "$gpu_test2_pod0_ctr0_gpus" | while read gpu_test2_pod0_ctr0_gpu; do
+  if gpu-already-seen "$gpu_test2_pod0_ctr0_gpu"; then
+    echo "Pod gpu-test2/pod0, container ctr0 should have a new GPU but claimed $gpu_test2_pod0_ctr0_gpu which is already claimed"
+    exit 1
+  fi
+  echo "Pod gpu-test2/pod0, container ctr0 claimed $gpu_test2_pod0_ctr0_gpu"
+  observed_gpus+=("$gpu_test2_pod0_ctr0_gpu")
+done
+
 kubectl wait --for=condition=Ready -n gpu-test3 pod/pod0 --timeout=120s
 gpu_test_3=$(kubectl get pods -n gpu-test3 | grep -c 'Running')
@@ -63,7 +133,56 @@ if [ $gpu_test_3 != 1 ]; then
     echo "gpu_test_3 $gpu_test_3 failed to match against 1 expected pod"
     exit 1
 fi
-verify-env gpu-test3 pod0
+
+gpu_test3_pod0_ctr0_logs=$(kubectl logs -n gpu-test3 pod0 -c ctr0)
+gpu_test3_pod0_ctr0_gpus=$(gpus-from-logs "$gpu_test3_pod0_ctr0_logs")
+gpu_test3_pod0_ctr0_gpus_count=$(echo "$gpu_test3_pod0_ctr0_gpus" | wc -w)
+if [[ $gpu_test3_pod0_ctr0_gpus_count != 1 ]]; then
+  echo "Expected Pod gpu-test3/pod0, container ctr0 to have 1 GPU, but got $gpu_test3_pod0_ctr0_gpus_count: $gpu_test3_pod0_ctr0_gpus"
+  exit 1
+fi
+gpu_test3_pod0_ctr0_gpu="$gpu_test3_pod0_ctr0_gpus"
+if gpu-already-seen "$gpu_test3_pod0_ctr0_gpu"; then
+  echo "Pod gpu-test3/pod0, container ctr0 should have a new GPU but claimed $gpu_test3_pod0_ctr0_gpu which is already claimed"
+  exit 1
+fi
+echo "Pod gpu-test3/pod0, container ctr0 claimed $gpu_test3_pod0_ctr0_gpu"
+observed_gpus+=("$gpu_test3_pod0_ctr0_gpu")
+gpu_test3_pod0_ctr0_sharing_strategy=$(gpu-sharing-strategy-from-logs "$gpu_test3_pod0_ctr0_logs" $(gpu-id "$gpu_test3_pod0_ctr0_gpu"))
+if [[ "$gpu_test3_pod0_ctr0_sharing_strategy" != "TimeSlicing" ]]; then
+  echo "Expected Pod gpu-test3/pod0, container ctr0 to have sharing strategy TimeSlicing, got $gpu_test3_pod0_ctr0_sharing_strategy"
+  exit 1
+fi
+gpu_test3_pod0_ctr0_timeslice_interval=$(gpu-timeslice-interval-from-logs "$gpu_test3_pod0_ctr0_logs" $(gpu-id "$gpu_test3_pod0_ctr0_gpu"))
+if [[ "$gpu_test3_pod0_ctr0_timeslice_interval" != "Default" ]]; then
+  echo "Expected Pod gpu-test3/pod0, container ctr0 to have timeslice interval Default, got $gpu_test3_pod0_ctr0_timeslice_interval"
+  exit 1
+fi
+
+gpu_test3_pod0_ctr1_logs=$(kubectl logs -n gpu-test3 pod0 -c ctr1)
+gpu_test3_pod0_ctr1_gpus=$(gpus-from-logs "$gpu_test3_pod0_ctr1_logs")
+gpu_test3_pod0_ctr1_gpus_count=$(echo "$gpu_test3_pod0_ctr1_gpus" | wc -w)
+if [[ $gpu_test3_pod0_ctr1_gpus_count != 1 ]]; then
+  echo "Expected Pod gpu-test3/pod0, container ctr1 to have 1 GPU, but got $gpu_test3_pod0_ctr1_gpus_count: $gpu_test3_pod0_ctr1_gpus"
+  exit 1
+fi
+gpu_test3_pod0_ctr1_gpu="$gpu_test3_pod0_ctr1_gpus"
+echo "Pod gpu-test3/pod0, container ctr1 claimed $gpu_test3_pod0_ctr1_gpu"
+if [[ "$gpu_test3_pod0_ctr1_gpu" != "$gpu_test3_pod0_ctr0_gpu" ]]; then
+  echo "Pod gpu-test3/pod0, container ctr1 should claim the same GPU as Pod gpu-test3/pod0, container ctr0, but did not"
+  exit 1
+fi
+gpu_test3_pod0_ctr1_sharing_strategy=$(gpu-sharing-strategy-from-logs "$gpu_test3_pod0_ctr1_logs" $(gpu-id "$gpu_test3_pod0_ctr1_gpu"))
+if [[ "$gpu_test3_pod0_ctr1_sharing_strategy" != "TimeSlicing" ]]; then
+  echo "Expected Pod gpu-test3/pod0, container ctr1 to have sharing strategy TimeSlicing, got $gpu_test3_pod0_ctr1_sharing_strategy"
+  exit 1
+fi
+gpu_test3_pod0_ctr1_timeslice_interval=$(gpu-timeslice-interval-from-logs "$gpu_test3_pod0_ctr1_logs" $(gpu-id "$gpu_test3_pod0_ctr1_gpu"))
+if [[ "$gpu_test3_pod0_ctr1_timeslice_interval" != "Default" ]]; then
+  echo "Expected Pod gpu-test3/pod0, container ctr1 to have timeslice interval Default, got $gpu_test3_pod0_ctr1_timeslice_interval"
+  exit 1
+fi
+
 kubectl wait --for=condition=Ready -n gpu-test4 pod/pod0 --timeout=120s
 kubectl wait --for=condition=Ready -n gpu-test4 pod/pod1 --timeout=120s
 gpu_test_4=$(kubectl get pods -n gpu-test4 | grep -c 'Running')
@@ -72,8 +191,56 @@ if [ $gpu_test_4 != 2 ]; then
     echo "gpu_test_4 $gpu_test_4 failed to match against 2 expected pods"
     exit 1
 fi
-verify-env gpu-test4 pod0
-verify-env gpu-test4 pod1
+
+gpu_test4_pod0_ctr0_logs=$(kubectl logs -n gpu-test4 pod0 -c ctr0)
+gpu_test4_pod0_ctr0_gpus=$(gpus-from-logs "$gpu_test4_pod0_ctr0_logs")
+gpu_test4_pod0_ctr0_gpus_count=$(echo "$gpu_test4_pod0_ctr0_gpus" | wc -w)
+if [[ $gpu_test4_pod0_ctr0_gpus_count != 1 ]]; then
+  echo "Expected Pod gpu-test4/pod0, container ctr0 to have 1 GPU, but got $gpu_test4_pod0_ctr0_gpus_count: $gpu_test4_pod0_ctr0_gpus"
+  exit 1
+fi
+gpu_test4_pod0_ctr0_gpu="$gpu_test4_pod0_ctr0_gpus"
+if gpu-already-seen "$gpu_test4_pod0_ctr0_gpu"; then
+  echo "Pod gpu-test4/pod0, container ctr0 should have a new GPU but claimed $gpu_test4_pod0_ctr0_gpu which is already claimed"
+  exit 1
+fi
+echo "Pod gpu-test4/pod0, container ctr0 claimed $gpu_test4_pod0_ctr0_gpu"
+observed_gpus+=("$gpu_test4_pod0_ctr0_gpu")
+gpu_test4_pod0_ctr0_sharing_strategy=$(gpu-sharing-strategy-from-logs "$gpu_test4_pod0_ctr0_logs" $(gpu-id "$gpu_test4_pod0_ctr0_gpu"))
+if [[ "$gpu_test4_pod0_ctr0_sharing_strategy" != "TimeSlicing" ]]; then
+  echo "Expected Pod gpu-test4/pod0, container ctr0 to have sharing strategy TimeSlicing, got $gpu_test4_pod0_ctr0_sharing_strategy"
+  exit 1
+fi
+gpu_test4_pod0_ctr0_timeslice_interval=$(gpu-timeslice-interval-from-logs "$gpu_test4_pod0_ctr0_logs" $(gpu-id "$gpu_test4_pod0_ctr0_gpu"))
+if [[ "$gpu_test4_pod0_ctr0_timeslice_interval" != "Default" ]]; then
+  echo "Expected Pod gpu-test4/pod0, container ctr0 to have timeslice interval Default, got $gpu_test4_pod0_ctr0_timeslice_interval"
+  exit 1
+fi
+
+gpu_test4_pod1_ctr0_logs=$(kubectl logs -n gpu-test4 pod1 -c ctr0)
+gpu_test4_pod1_ctr0_gpus=$(gpus-from-logs "$gpu_test4_pod1_ctr0_logs")
+gpu_test4_pod1_ctr0_gpus_count=$(echo "$gpu_test4_pod1_ctr0_gpus" | wc -w)
+if [[ $gpu_test4_pod1_ctr0_gpus_count != 1 ]]; then
+  echo "Expected Pod gpu-test4/pod1, container ctr0 to have 1 GPU, but got $gpu_test4_pod1_ctr0_gpus_count: $gpu_test4_pod1_ctr0_gpus"
+  exit 1
+fi
+gpu_test4_pod1_ctr0_gpu="$gpu_test4_pod1_ctr0_gpus"
+echo "Pod gpu-test4/pod1, container ctr0 claimed $gpu_test4_pod1_ctr0_gpu"
+if [[ "$gpu_test4_pod1_ctr0_gpu" != "$gpu_test4_pod0_ctr0_gpu" ]]; then
+  echo "Pod gpu-test4/pod1, container ctr0 should claim the same GPU as Pod gpu-test4/pod0, container ctr0, but did not"
+  exit 1
+fi
+gpu_test4_pod1_ctr0_sharing_strategy=$(gpu-sharing-strategy-from-logs "$gpu_test4_pod1_ctr0_logs" $(gpu-id "$gpu_test4_pod1_ctr0_gpu"))
+if [[ "$gpu_test4_pod1_ctr0_sharing_strategy" != "TimeSlicing" ]]; then
+  echo "Expected Pod gpu-test4/pod1, container ctr0 to have sharing strategy TimeSlicing, got $gpu_test4_pod1_ctr0_sharing_strategy"
+  exit 1
+fi
+gpu_test4_pod1_ctr0_timeslice_interval=$(gpu-timeslice-interval-from-logs "$gpu_test4_pod1_ctr0_logs" $(gpu-id "$gpu_test4_pod1_ctr0_gpu"))
+if [[ "$gpu_test4_pod1_ctr0_timeslice_interval" != "Default" ]]; then
+  echo "Expected Pod gpu-test4/pod1, container ctr0 to have timeslice interval Default, got $gpu_test4_pod1_ctr0_timeslice_interval"
+  exit 1
+fi
+
 kubectl wait --for=condition=Ready -n gpu-test5 pod/pod0 --timeout=120s
 gpu_test_5=$(kubectl get pods -n gpu-test5 | grep -c 'Running')
@@ -81,7 +248,104 @@ if [ $gpu_test_5 != 1 ]; then
     echo "gpu_test_5 $gpu_test_5 failed to match against 1 expected pod"
     exit 1
 fi
-verify-env gpu-test5 pod0
+
+gpu_test5_pod0_ts_ctr0_logs=$(kubectl logs -n gpu-test5 pod0 -c ts-ctr0)
+gpu_test5_pod0_ts_ctr0_gpus=$(gpus-from-logs "$gpu_test5_pod0_ts_ctr0_logs")
+gpu_test5_pod0_ts_ctr0_gpus_count=$(echo "$gpu_test5_pod0_ts_ctr0_gpus" | wc -w)
+if [[ $gpu_test5_pod0_ts_ctr0_gpus_count != 1 ]]; then
+  echo "Expected Pod gpu-test5/pod0, container ts-ctr0 to have 1 GPU, but got $gpu_test5_pod0_ts_ctr0_gpus_count: $gpu_test5_pod0_ts_ctr0_gpus"
+  exit 1
+fi
+gpu_test5_pod0_ts_ctr0_gpu="$gpu_test5_pod0_ts_ctr0_gpus"
+if gpu-already-seen "$gpu_test5_pod0_ts_ctr0_gpu"; then
+  echo "Pod gpu-test5/pod0, container ts-ctr0 should have a new GPU but claimed $gpu_test5_pod0_ts_ctr0_gpu which is already claimed"
+  exit 1
+fi
+echo "Pod gpu-test5/pod0, container ts-ctr0 claimed $gpu_test5_pod0_ts_ctr0_gpu"
+observed_gpus+=("$gpu_test5_pod0_ts_ctr0_gpu")
+gpu_test5_pod0_ts_ctr0_sharing_strategy=$(gpu-sharing-strategy-from-logs "$gpu_test5_pod0_ts_ctr0_logs" $(gpu-id "$gpu_test5_pod0_ts_ctr0_gpu"))
+if [[ "$gpu_test5_pod0_ts_ctr0_sharing_strategy" != "TimeSlicing" ]]; then
+  echo "Expected Pod gpu-test5/pod0, container ts-ctr0 to have sharing strategy TimeSlicing, got $gpu_test5_pod0_ts_ctr0_sharing_strategy"
+  exit 1
+fi
+gpu_test5_pod0_ts_ctr0_timeslice_interval=$(gpu-timeslice-interval-from-logs "$gpu_test5_pod0_ts_ctr0_logs" $(gpu-id "$gpu_test5_pod0_ts_ctr0_gpu"))
+if [[ "$gpu_test5_pod0_ts_ctr0_timeslice_interval" != "Long" ]]; then
+  echo "Expected Pod gpu-test5/pod0, container ts-ctr0 to have timeslice interval Long, got $gpu_test5_pod0_ts_ctr0_timeslice_interval"
+  exit 1
+fi
+
+gpu_test5_pod0_ts_ctr1_logs=$(kubectl logs -n gpu-test5 pod0 -c ts-ctr1)
+gpu_test5_pod0_ts_ctr1_gpus=$(gpus-from-logs "$gpu_test5_pod0_ts_ctr1_logs")
+gpu_test5_pod0_ts_ctr1_gpus_count=$(echo "$gpu_test5_pod0_ts_ctr1_gpus" | wc -w)
+if [[ $gpu_test5_pod0_ts_ctr1_gpus_count != 1 ]]; then
+  echo "Expected Pod gpu-test5/pod0, container ts-ctr1 to have 1 GPU, but got $gpu_test5_pod0_ts_ctr1_gpus_count: $gpu_test5_pod0_ts_ctr1_gpus"
+  exit 1
+fi
+gpu_test5_pod0_ts_ctr1_gpu="$gpu_test5_pod0_ts_ctr1_gpus"
+echo "Pod gpu-test5/pod0, container ts-ctr1 claimed $gpu_test5_pod0_ts_ctr1_gpu"
+if [[ "$gpu_test5_pod0_ts_ctr1_gpu" != "$gpu_test5_pod0_ts_ctr0_gpu" ]]; then
+  echo "Pod gpu-test5/pod0, container ts-ctr1 should claim the same GPU as Pod gpu-test5/pod0, container ts-ctr0, but did not"
+  exit 1
+fi
+gpu_test5_pod0_ts_ctr1_sharing_strategy=$(gpu-sharing-strategy-from-logs "$gpu_test5_pod0_ts_ctr1_logs" $(gpu-id "$gpu_test5_pod0_ts_ctr1_gpu"))
+if [[ "$gpu_test5_pod0_ts_ctr1_sharing_strategy" != "TimeSlicing" ]]; then
+  echo "Expected Pod gpu-test5/pod0, container ts-ctr1 to have sharing strategy TimeSlicing, got $gpu_test5_pod0_ts_ctr1_sharing_strategy"
+  exit 1
+fi
+gpu_test5_pod0_ts_ctr1_timeslice_interval=$(gpu-timeslice-interval-from-logs "$gpu_test5_pod0_ts_ctr1_logs" $(gpu-id "$gpu_test5_pod0_ts_ctr1_gpu"))
+if [[ "$gpu_test5_pod0_ts_ctr1_timeslice_interval" != "Long" ]]; then
+  echo "Expected Pod gpu-test5/pod0, container ts-ctr1 to have timeslice interval Long, got $gpu_test5_pod0_ts_ctr1_timeslice_interval"
+  exit 1
+fi
+
+gpu_test5_pod0_sp_ctr0_logs=$(kubectl logs -n gpu-test5 pod0 -c sp-ctr0)
+gpu_test5_pod0_sp_ctr0_gpus=$(gpus-from-logs "$gpu_test5_pod0_sp_ctr0_logs")
+gpu_test5_pod0_sp_ctr0_gpus_count=$(echo "$gpu_test5_pod0_sp_ctr0_gpus" | wc -w)
+if [[ $gpu_test5_pod0_sp_ctr0_gpus_count != 1 ]]; then
+  echo "Expected Pod gpu-test5/pod0, container sp-ctr0 to have 1 GPU, but got $gpu_test5_pod0_sp_ctr0_gpus_count: $gpu_test5_pod0_sp_ctr0_gpus"
+  exit 1
+fi
+gpu_test5_pod0_sp_ctr0_gpu="$gpu_test5_pod0_sp_ctr0_gpus"
+if gpu-already-seen "$gpu_test5_pod0_sp_ctr0_gpu"; then
+  echo "Pod gpu-test5/pod0, container sp-ctr0 should have a new GPU but claimed $gpu_test5_pod0_sp_ctr0_gpu which is already claimed"
+  exit 1
+fi
+echo "Pod gpu-test5/pod0, container sp-ctr0 claimed $gpu_test5_pod0_sp_ctr0_gpu"
+observed_gpus+=("$gpu_test5_pod0_sp_ctr0_gpu")
+gpu_test5_pod0_sp_ctr0_sharing_strategy=$(gpu-sharing-strategy-from-logs "$gpu_test5_pod0_sp_ctr0_logs" $(gpu-id "$gpu_test5_pod0_sp_ctr0_gpu"))
+if [[ "$gpu_test5_pod0_sp_ctr0_sharing_strategy" != "SpacePartitioning" ]]; then
+  echo "Expected Pod gpu-test5/pod0, container sp-ctr0 to have sharing strategy SpacePartitioning, got $gpu_test5_pod0_sp_ctr0_sharing_strategy"
+  exit 1
+fi
+gpu_test5_pod0_sp_ctr0_partition_count=$(gpu-partition-count-from-logs "$gpu_test5_pod0_sp_ctr0_logs" $(gpu-id "$gpu_test5_pod0_sp_ctr0_gpu"))
+if [[ "$gpu_test5_pod0_sp_ctr0_partition_count" != "10" ]]; then
+  echo "Expected Pod gpu-test5/pod0, container sp-ctr0 to have partition count 10, got $gpu_test5_pod0_sp_ctr0_partition_count"
+  exit 1
+fi
+
+gpu_test5_pod0_sp_ctr1_logs=$(kubectl logs -n gpu-test5 pod0 -c sp-ctr1)
+gpu_test5_pod0_sp_ctr1_gpus=$(gpus-from-logs "$gpu_test5_pod0_sp_ctr1_logs")
+gpu_test5_pod0_sp_ctr1_gpus_count=$(echo "$gpu_test5_pod0_sp_ctr1_gpus" | wc -w)
+if [[ $gpu_test5_pod0_sp_ctr1_gpus_count != 1 ]]; then
+  echo "Expected Pod gpu-test5/pod0, container sp-ctr1 to have 1 GPU, but got $gpu_test5_pod0_sp_ctr1_gpus_count: $gpu_test5_pod0_sp_ctr1_gpus"
+  exit 1
+fi
+gpu_test5_pod0_sp_ctr1_gpu="$gpu_test5_pod0_sp_ctr1_gpus"
+echo "Pod gpu-test5/pod0, container sp-ctr1 claimed $gpu_test5_pod0_sp_ctr1_gpu"
+if [[ "$gpu_test5_pod0_sp_ctr1_gpu" != "$gpu_test5_pod0_sp_ctr0_gpu" ]]; then
+  echo "Pod gpu-test5/pod0, container sp-ctr1 should claim the same GPU as Pod gpu-test5/pod0, container sp-ctr0, but did not"
+  exit 1
+fi
+gpu_test5_pod0_sp_ctr1_sharing_strategy=$(gpu-sharing-strategy-from-logs "$gpu_test5_pod0_sp_ctr1_logs" $(gpu-id "$gpu_test5_pod0_sp_ctr1_gpu"))
+if [[ "$gpu_test5_pod0_sp_ctr1_sharing_strategy" != "SpacePartitioning" ]]; then
+  echo "Expected Pod gpu-test5/pod0, container sp-ctr1 to have sharing strategy SpacePartitioning, got $gpu_test5_pod0_sp_ctr1_sharing_strategy"
+  exit 1
+fi
+gpu_test5_pod0_sp_ctr1_partition_count=$(gpu-partition-count-from-logs "$gpu_test5_pod0_sp_ctr1_logs" $(gpu-id "$gpu_test5_pod0_sp_ctr1_gpu"))
+if [[ "$gpu_test5_pod0_sp_ctr1_partition_count" != "10" ]]; then
+  echo "Expected Pod gpu-test5/pod0, container sp-ctr1 to have partition count 10, got $gpu_test5_pod0_sp_ctr1_partition_count"
+  exit 1
+fi
 
 # test that deletion is fast (less than the default grace period of 30s)
 # see https://github.com/kubernetes/kubernetes/issues/127188 for details