Skip to content

Commit ef689d7

Browse files
committed
Fix for update workflow
When terminating a function and replacing it during an update, there was often an error about a task precondition not being met, which meant having to retry and/or wait, or being left in an inconsistent state. The new flow makes sure "Wait" is called in either code path, and allows for a custom gap between the SIGTERM and the SIGKILL through the grace_period env var, which is set as a Go duration (for example, 30s). Signed-off-by: Alex Ellis (OpenFaaS Ltd) <[email protected]>
1 parent 854ec58 commit ef689d7

File tree

2 files changed

+44
-9
lines changed

2 files changed

+44
-9
lines changed

pkg/provider/handlers/function_list.go

+5-1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"errors"
66
"log"
7+
"strings"
78
"time"
89

910
"github.com/containerd/containerd"
@@ -51,7 +52,10 @@ func ListFunctions(client *containerd.Client, namespace string) (map[string]*Fun
5152
name := c.ID()
5253
f, err := GetFunction(client, name, namespace)
5354
if err != nil {
54-
log.Printf("skipping %s, error: %s", name, err)
55+
if !strings.Contains(err.Error(), "unable to get IP address for container") {
56+
log.Printf("List functions, skipping: %s, error: %s", name, err)
57+
}
58+
5559
} else {
5660
functions[name] = &f
5761
}

pkg/service/service.go

+39-8
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"log"
77
"os"
88
"path/filepath"
9+
"strings"
910
"sync"
1011
"time"
1112

@@ -45,10 +46,24 @@ func Remove(ctx context.Context, client *containerd.Client, name string) error {
4546
log.Printf("Status of %s is: %s\n", name, status.Status)
4647
}
4748

48-
log.Printf("Need to kill task: %s\n", name)
49-
if err = killTask(ctx, t); err != nil {
49+
var gracePeriod = time.Second * 30
50+
spec, err := t.Spec(ctx)
51+
if err == nil {
52+
for _, p := range spec.Process.Env {
53+
k, v, ok := strings.Cut(p, "=")
54+
if ok && k == "grace_period" {
55+
periodVal, err := time.ParseDuration(v)
56+
if err == nil {
57+
gracePeriod = periodVal
58+
}
59+
}
60+
}
61+
}
62+
63+
if err = killTask(ctx, t, gracePeriod); err != nil {
5064
return fmt.Errorf("error killing task %s, %s, %w", container.ID(), name, err)
5165
}
66+
5267
}
5368

5469
if err := container.Delete(ctx, containerd.WithSnapshotCleanup); err != nil {
@@ -66,14 +81,13 @@ func Remove(ctx context.Context, client *containerd.Client, name string) error {
6681
}
6782

6883
// Adapted from Stellar - https://github.com/stellar
69-
func killTask(ctx context.Context, task containerd.Task) error {
70-
71-
killTimeout := 30 * time.Second
84+
func killTask(ctx context.Context, task containerd.Task, gracePeriod time.Duration) error {
7285

7386
wg := &sync.WaitGroup{}
7487
wg.Add(1)
7588
var err error
7689

90+
waited := false
7791
go func() {
7892
defer wg.Done()
7993
if task != nil {
@@ -89,18 +103,35 @@ func killTask(ctx context.Context, task containerd.Task) error {
89103

90104
select {
91105
case <-wait:
92-
task.Delete(ctx)
106+
waited = true
93107
return
94-
case <-time.After(killTimeout):
108+
case <-time.After(gracePeriod):
109+
log.Printf("Sending SIGKILL to: %s after: %s", task.ID(), gracePeriod.Round(time.Second).String())
95110
if err := task.Kill(ctx, unix.SIGKILL, containerd.WithKillAll); err != nil {
96-
log.Printf("error force killing container task: %s", err)
111+
log.Printf("error sending SIGKILL to task: %s", err)
97112
}
113+
98114
return
99115
}
100116
}
101117
}()
102118
wg.Wait()
103119

120+
if task != nil {
121+
if !waited {
122+
wait, err := task.Wait(ctx)
123+
if err != nil {
124+
log.Printf("error waiting on task after kill: %s", err)
125+
}
126+
127+
<-wait
128+
}
129+
130+
if _, err := task.Delete(ctx); err != nil {
131+
return err
132+
}
133+
}
134+
104135
return err
105136
}
106137

0 commit comments

Comments
 (0)