@@ -5,8 +5,10 @@ import (
5
5
"context"
6
6
"fmt"
7
7
"io"
8
+ "log"
8
9
"strings"
9
10
"text/template"
11
+ "time"
10
12
11
13
"github.com/ohsu-comp-bio/funnel/tes"
12
14
v1 "k8s.io/api/batch/v1"
@@ -94,8 +96,22 @@ func (kcmd KubernetesCommand) Run(ctx context.Context) error {
94
96
95
97
_ , err = client .Create (ctx , job , metav1.CreateOptions {})
96
98
99
+ var maxRetries = 5
100
+
97
101
if err != nil {
98
- return fmt .Errorf ("creating job in worker: %v" , err )
102
+ // Retry creating the Executor Pod on failure
103
+ var retryCount int
104
+ for retryCount < maxRetries {
105
+ _ , err = client .Create (ctx , job , metav1.CreateOptions {})
106
+ if err == nil {
107
+ break
108
+ }
109
+ retryCount ++
110
+ time .Sleep (2 * time .Second )
111
+ }
112
+ if retryCount == maxRetries {
113
+ return fmt .Errorf ("Funnel Worker: Failed to create Executor Job after %v attempts: %v" , maxRetries , err )
114
+ }
99
115
}
100
116
101
117
// Wait until the job finishes
@@ -109,27 +125,65 @@ func (kcmd KubernetesCommand) Run(ctx context.Context) error {
109
125
}
110
126
111
127
for _ , v := range pods .Items {
112
- req := clientset .CoreV1 ().Pods (kcmd .Namespace ).GetLogs (v .Name , & corev1.PodLogOptions {})
113
- podLogs , err := req .Stream (ctx )
114
-
128
+ // Wait for the pod to reach Running state
129
+ pod , err := waitForPodRunning (ctx , kcmd .Namespace , v .Name , 5 * time .Minute )
115
130
if err != nil {
116
- return err
131
+ log . Fatalf ( "Error waiting for pod: %v" , err )
117
132
}
118
133
119
- defer podLogs .Close ()
120
- buf := new (bytes.Buffer )
121
- _ , err = io .Copy (buf , podLogs )
134
+ // Stream logs from the running pod
135
+ err = streamPodLogs (ctx , kcmd .Namespace , pod .Name , kcmd .Stdout )
122
136
if err != nil {
123
- return err
137
+ log . Fatalf ( "Error streaming logs: %v" , err )
124
138
}
125
-
126
- var bytes = buf .Bytes ()
127
- kcmd .Stdout .Write (bytes )
128
139
}
129
140
130
141
return nil
131
142
}
132
143
144
+ func waitForPodRunning (ctx context.Context , namespace string , podName string , timeout time.Duration ) (* corev1.Pod , error ) {
145
+ clientset , err := getKubernetesClientset ()
146
+ if err != nil {
147
+ return nil , fmt .Errorf ("failed getting kubernetes clientset: %v" , err )
148
+ }
149
+
150
+ ticker := time .NewTicker (2 * time .Second )
151
+ defer ticker .Stop ()
152
+
153
+ timeoutCh := time .After (timeout )
154
+
155
+ for {
156
+ select {
157
+ case <- timeoutCh :
158
+ return nil , fmt .Errorf ("timed out waiting for pod %s to be in running state" , podName )
159
+ case <- ticker .C :
160
+ pod , err := clientset .CoreV1 ().Pods (namespace ).Get (ctx , podName , metav1.GetOptions {})
161
+ if err != nil {
162
+ return nil , fmt .Errorf ("getting pod %s: %v" , podName , err )
163
+ }
164
+
165
+ return pod , nil
166
+ }
167
+ }
168
+ }
169
+
170
+ func streamPodLogs (ctx context.Context , namespace string , podName string , stdout io.Writer ) error {
171
+ clientset , err := getKubernetesClientset ()
172
+ if err != nil {
173
+ return fmt .Errorf ("getting kubernetes clientset: %v" , err )
174
+ }
175
+
176
+ req := clientset .CoreV1 ().Pods (namespace ).GetLogs (podName , & corev1.PodLogOptions {})
177
+ podLogs , err := req .Stream (ctx )
178
+ if err != nil {
179
+ return fmt .Errorf ("streaming logs: %v" , err )
180
+ }
181
+ defer podLogs .Close ()
182
+
183
+ _ , err = io .Copy (stdout , podLogs )
184
+ return err
185
+ }
186
+
133
187
// Deletes the job running the task.
134
188
func (kcmd KubernetesCommand ) Stop () error {
135
189
clientset , err := getKubernetesClientset ()
0 commit comments