Skip to content

Commit 72c7a84

Browse files
committed
lima-guestagent: Handle SIGTERM, support save/restore eventState on stop/start
`eventState` is saved at `/run/lima-guestagent/event-state.json`. The saved `eventState` is expected to be removed on OS restart. Signed-off-by: Norio Nomura <[email protected]> # Conflicts: # pkg/guestagent/api/server/server.go
1 parent 6f010b0 commit 72c7a84

File tree

9 files changed

+176
-26
lines changed

9 files changed

+176
-26
lines changed

cmd/lima-guestagent/daemon_linux.go

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ import (
77
"errors"
88
"net"
99
"os"
10+
"os/signal"
11+
"syscall"
1012
"time"
1113

1214
"github.com/mdlayher/vsock"
@@ -26,6 +28,7 @@ func newDaemonCommand() *cobra.Command {
2628
Short: "Run the daemon",
2729
RunE: daemonAction,
2830
}
31+
daemonCommand.Flags().String("runtime-dir", "/run/lima-guestagent", "Directory to store runtime state")
2932
daemonCommand.Flags().Duration("tick", 3*time.Second, "Tick for polling events")
3033
daemonCommand.Flags().Int("vsock-port", 0, "Use vsock server instead a UNIX socket")
3134
daemonCommand.Flags().String("virtio-port", "", "Use virtio server instead a UNIX socket")
@@ -34,6 +37,13 @@ func newDaemonCommand() *cobra.Command {
3437

3538
func daemonAction(cmd *cobra.Command, _ []string) error {
3639
ctx := cmd.Context()
40+
runtimeDir, err := cmd.Flags().GetString("runtime-dir")
41+
if err != nil {
42+
return err
43+
}
44+
if err := os.MkdirAll(runtimeDir, 0o755); err != nil {
45+
return err
46+
}
3747
socket := "/run/lima-guestagent.sock"
3848
tick, err := cmd.Flags().GetDuration("tick")
3949
if err != nil {
@@ -66,11 +76,16 @@ func daemonAction(cmd *cobra.Command, _ []string) error {
6676
tickerInst = ticker.NewCompoundTicker(simpleTicker, ebpfTicker)
6777
}
6878

69-
agent, err := guestagent.New(ctx, tickerInst)
79+
ctx, stop := signal.NotifyContext(ctx, syscall.SIGTERM)
80+
defer stop()
81+
go func() {
82+
<-ctx.Done()
83+
logrus.Debug("Received SIGTERM, shutting down the guest agent")
84+
}()
85+
agent, err := guestagent.New(ctx, tickerInst, runtimeDir)
7086
if err != nil {
7187
return err
7288
}
73-
defer agent.Close()
7489

7590
err = os.RemoveAll(socket)
7691
if err != nil {
@@ -104,5 +119,6 @@ func daemonAction(cmd *cobra.Command, _ []string) error {
104119
l = socketL
105120
logrus.Infof("serving the guest agent on %q", socket)
106121
}
107-
return server.StartServer(l, &server.GuestServer{Agent: agent, TunnelS: portfwdserver.NewTunnelServer()})
122+
defer logrus.Debug("exiting lima-guestagent daemon")
123+
return server.StartServer(ctx, l, &server.GuestServer{Agent: agent, TunnelS: portfwdserver.NewTunnelServer()})
108124
}

cmd/lima-guestagent/lima-guestagent.TEMPLATE.service

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Description=lima-guestagent
33

44
[Service]
5-
ExecStart={{.Binary}} daemon {{.Args}}
5+
ExecStart={{.Binary}} daemon {{.Args}} --runtime-dir="%t/%N"
66
Type=simple
77
Restart=on-failure
88
OOMPolicy=continue

cmd/lima-guestagent/main_linux.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,14 @@ import (
1111

1212
"github.com/lima-vm/lima/v2/cmd/yq"
1313
"github.com/lima-vm/lima/v2/pkg/debugutil"
14+
"github.com/lima-vm/lima/v2/pkg/osutil"
1415
"github.com/lima-vm/lima/v2/pkg/version"
1516
)
1617

1718
func main() {
1819
yq.MaybeRunYQ()
1920
if err := newApp().Execute(); err != nil {
21+
osutil.HandleExitError(err)
2022
logrus.Fatal(err)
2123
}
2224
}

pkg/guestagent/api/server/server.go

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"context"
88
"net"
99

10+
"github.com/sirupsen/logrus"
1011
"google.golang.org/grpc"
1112
"google.golang.org/grpc/keepalive"
1213
"google.golang.org/protobuf/types/known/emptypb"
@@ -16,7 +17,7 @@ import (
1617
"github.com/lima-vm/lima/v2/pkg/portfwdserver"
1718
)
1819

19-
func StartServer(lis net.Listener, guest *GuestServer) error {
20+
func StartServer(ctx context.Context, lis net.Listener, guest *GuestServer) error {
2021
server := grpc.NewServer(
2122
grpc.InitialWindowSize(512<<20),
2223
grpc.InitialConnWindowSize(512<<20),
@@ -26,7 +27,19 @@ func StartServer(lis net.Listener, guest *GuestServer) error {
2627
grpc.KeepaliveParams(keepalive.ServerParameters{Time: 0, Timeout: 0, MaxConnectionIdle: 0}),
2728
)
2829
api.RegisterGuestServiceServer(server, guest)
29-
return server.Serve(lis)
30+
go func() {
31+
<-ctx.Done()
32+
logrus.Debug("Stopping the gRPC server")
33+
server.GracefulStop()
34+
logrus.Debug("Closing the listener used by the gRPC server")
35+
lis.Close()
36+
}()
37+
err := server.Serve(lis)
38+
// grpc.Server.Serve() expects to return a non-nil error caused by lis.Accept()
39+
if opErr, ok := err.(*net.OpError); ok && opErr.Err.Error() == "use of closed network connection" {
40+
return nil
41+
}
42+
return err
3043
}
3144

3245
type GuestServer struct {
@@ -41,6 +54,7 @@ func (s *GuestServer) GetInfo(ctx context.Context, _ *emptypb.Empty) (*api.Info,
4154

4255
func (s *GuestServer) GetEvents(_ *emptypb.Empty, stream api.GuestService_GetEventsServer) error {
4356
responses := make(chan *api.Event)
57+
// expects Events() to close the channel when stream.Context() is done or ticker stops
4458
go s.Agent.Events(stream.Context(), responses)
4559
for response := range responses {
4660
err := stream.Send(response)

pkg/guestagent/guestagent_linux.go

Lines changed: 76 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@ package guestagent
55

66
import (
77
"context"
8+
"encoding/json"
9+
"errors"
810
"os"
11+
"path/filepath"
912
"reflect"
1013
"time"
1114

@@ -19,7 +22,7 @@ import (
1922
"github.com/lima-vm/lima/v2/pkg/guestagent/timesync"
2023
)
2124

22-
func New(ctx context.Context, ticker ticker.Ticker) (Agent, error) {
25+
func New(ctx context.Context, ticker ticker.Ticker, runtimeDir string) (Agent, error) {
2326
socketsLister, err := sockets.NewLister()
2427
if err != nil {
2528
return nil, err
@@ -28,25 +31,37 @@ func New(ctx context.Context, ticker ticker.Ticker) (Agent, error) {
2831
ticker: ticker,
2932
socketLister: socketsLister,
3033
kubernetesServiceWatcher: kubernetesservice.NewServiceWatcher(),
34+
runtimeDir: runtimeDir,
3135
}
3236

3337
go a.kubernetesServiceWatcher.Start(ctx)
34-
go a.fixSystemTimeSkew()
38+
go a.fixSystemTimeSkew(ctx)
39+
40+
go func() {
41+
<-ctx.Done()
42+
logrus.Debug("Closing the agent")
43+
if err := a.Close(); err != nil {
44+
logrus.Errorf("error on agent.Close(): %v", err)
45+
}
46+
}()
3547

3648
return a, nil
3749
}
3850

51+
var _ Agent = (*agent)(nil)
52+
3953
type agent struct {
4054
// Ticker is like time.Ticker.
4155
// We can't use inotify for /proc/net/tcp, so we need this ticker to
4256
// reload /proc/net/tcp.
4357
ticker ticker.Ticker
4458
socketLister *sockets.Lister
4559
kubernetesServiceWatcher *kubernetesservice.ServiceWatcher
60+
runtimeDir string
4661
}
4762

4863
type eventState struct {
49-
ports []*api.IPPort
64+
Ports []*api.IPPort `json:"ports,omitempty"`
5065
}
5166

5267
func comparePorts(old, neww []*api.IPPort) (added, removed []*api.IPPort) {
@@ -82,13 +97,13 @@ func (a *agent) collectEvent(ctx context.Context, st eventState) (*api.Event, ev
8297
err error
8398
)
8499
newSt := st
85-
newSt.ports, err = a.LocalPorts(ctx)
100+
newSt.Ports, err = a.LocalPorts(ctx)
86101
if err != nil {
87102
ev.Errors = append(ev.Errors, err.Error())
88103
ev.Time = timestamppb.Now()
89104
return ev, newSt
90105
}
91-
ev.AddedLocalPorts, ev.RemovedLocalPorts = comparePorts(st.ports, newSt.ports)
106+
ev.AddedLocalPorts, ev.RemovedLocalPorts = comparePorts(st.Ports, newSt.Ports)
92107
ev.Time = timestamppb.Now()
93108
return ev, newSt
94109
}
@@ -102,8 +117,16 @@ func isEventEmpty(ev *api.Event) bool {
102117
func (a *agent) Events(ctx context.Context, ch chan *api.Event) {
103118
defer close(ch)
104119
tickerCh := a.ticker.Chan()
105-
defer a.ticker.Stop()
106-
var st eventState
120+
121+
st, err := a.LoadEventState()
122+
if err != nil {
123+
logrus.Errorf("failed to load state: %v", err)
124+
}
125+
defer func() {
126+
if err := a.SaveEventState(st); err != nil {
127+
logrus.Errorf("failed to save state: %v", err)
128+
}
129+
}()
107130
for {
108131
var ev *api.Event
109132
ev, st = a.collectEvent(ctx, st)
@@ -115,6 +138,7 @@ func (a *agent) Events(ctx context.Context, ch chan *api.Event) {
115138
return
116139
case _, ok := <-tickerCh:
117140
if !ok {
141+
logrus.Debug("ticker channel closed")
118142
return
119143
}
120144
logrus.Debug("tick!")
@@ -190,7 +214,7 @@ func (a *agent) Info(ctx context.Context) (*api.Info, error) {
190214

191215
const deltaLimit = 2 * time.Second
192216

193-
func (a *agent) fixSystemTimeSkew() {
217+
func (a *agent) fixSystemTimeSkew(ctx context.Context) {
194218
logrus.Info("fixSystemTimeSkew(): monitoring system time skew")
195219
for {
196220
ok, err := timesync.HasRTC()
@@ -217,6 +241,13 @@ func (a *agent) fixSystemTimeSkew() {
217241
logrus.Infof("fixSystemTimeSkew: system time synchronized with rtc")
218242
break
219243
}
244+
select {
245+
case <-ctx.Done():
246+
logrus.Debug("fixSystemTimeSkew: context done, exiting")
247+
ticker.Stop()
248+
return
249+
default:
250+
}
220251
}
221252
ticker.Stop()
222253
}
@@ -239,5 +270,42 @@ func (a *agent) Close() error {
239270
return err
240271
}
241272
}
273+
a.ticker.Stop()
242274
return nil
243275
}
276+
277+
const eventStateFileName = "event-state.json"
278+
279+
// LoadEventState loads the event state from a file in JSON format.
280+
// If the file does not exist, it returns an empty eventState with no error.
281+
// The saved eventState is expected to be removed on OS restart.
282+
func (a *agent) LoadEventState() (eventState, error) {
283+
logrus.Debug("Loading event state")
284+
path := filepath.Join(a.runtimeDir, eventStateFileName)
285+
data, err := os.ReadFile(path)
286+
if err != nil {
287+
if errors.Is(err, os.ErrNotExist) {
288+
return eventState{}, nil
289+
}
290+
return eventState{}, err
291+
}
292+
var st eventState
293+
if err := json.Unmarshal(data, &st); err != nil {
294+
return eventState{}, err
295+
}
296+
// We don't remove the file after loading for debugging purposes.
297+
return st, nil
298+
}
299+
300+
// SaveEventState saves the event state to a file in JSON format.
301+
// It overwrites the file if it already exists.
302+
// The saved eventState is expected to be removed on OS restart.
303+
func (a *agent) SaveEventState(st eventState) error {
304+
logrus.Debug("Saving event state")
305+
data, err := json.Marshal(st)
306+
if err != nil {
307+
return err
308+
}
309+
path := filepath.Join(a.runtimeDir, eventStateFileName)
310+
return os.WriteFile(path, data, 0o644)
311+
}

pkg/guestagent/ticker/compound.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ package ticker
55

66
import (
77
"time"
8+
9+
"github.com/sirupsen/logrus"
810
)
911

1012
func NewCompoundTicker(t1, t2 Ticker) Ticker {
@@ -15,10 +17,13 @@ type compoundTicker struct {
1517
t1, t2 Ticker
1618
}
1719

20+
var _ Ticker = (*compoundTicker)(nil)
21+
1822
func (ticker *compoundTicker) Chan() <-chan time.Time {
1923
ch := make(chan time.Time)
2024
go func() {
21-
defer ticker.Stop()
25+
defer close(ch)
26+
defer logrus.Debug("compoundTicker: exiting")
2227
for {
2328
select {
2429
case v, ok := <-ticker.t1.Chan():

0 commit comments

Comments
 (0)