-
-
Notifications
You must be signed in to change notification settings - Fork 512
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[client] Fix state manager race conditions #2890
base: main
Are you sure you want to change the base?
Changes from all commits
5b38c56
3c95f6f
00a4edc
cd0dbae
e07caa8
9a56fc0
81f0810
41c9c39
3c581d8
e1af056
eceab36
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,18 +71,20 @@ func (m *Manager) Stop(ctx context.Context) error { | |
return nil | ||
} | ||
|
||
var cancel context.CancelFunc | ||
m.mu.Lock() | ||
defer m.mu.Unlock() | ||
cancel = m.cancel | ||
m.mu.Unlock() | ||
|
||
if m.cancel != nil { | ||
m.cancel() | ||
if cancel == nil { | ||
return nil | ||
} | ||
cancel() | ||
|
||
select { | ||
case <-ctx.Done(): | ||
return ctx.Err() | ||
case <-m.done: | ||
return nil | ||
} | ||
select { | ||
case <-ctx.Done(): | ||
return ctx.Err() | ||
case <-m.done: | ||
} | ||
|
||
return nil | ||
|
@@ -179,14 +181,18 @@ func (m *Manager) PersistState(ctx context.Context) error { | |
return nil | ||
} | ||
|
||
bs, err := marshalWithPanicRecovery(m.states) | ||
if err != nil { | ||
return fmt.Errorf("marshal states: %w", err) | ||
} | ||
|
||
ctx, cancel := context.WithTimeout(ctx, 5*time.Second) | ||
defer cancel() | ||
|
||
done := make(chan error, 1) | ||
|
||
start := time.Now() | ||
go func() { | ||
done <- util.WriteJsonWithRestrictedPermission(ctx, m.filePath, m.states) | ||
done <- util.WriteBytesWithRestrictedPermission(ctx, m.filePath, bs) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What happens if this function runs for more than 10 sec? The ticker will start another PersistState call and there will be a conflict in the file. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That's what the ctx check and deadline are for in this fn. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. But the check and the move are not an atomic operation. If the code runs these lines in parallel with two different ctx, the outcome is unpredictable.
|
||
}() | ||
|
||
select { | ||
|
@@ -286,3 +292,19 @@ func (m *Manager) PerformCleanup() error { | |
|
||
return nberrors.FormatErrorOrNil(merr) | ||
} | ||
|
||
// marshalWithPanicRecovery encodes v as JSON with json.Marshal and converts a
// panic raised during encoding into an ordinary error instead of letting it
// propagate to the caller.
//
// On success it returns the encoded bytes and a nil error. If json.Marshal
// returns an error, that error is returned unchanged. If encoding panics, the
// returned bytes are nil and the error reads
// "panic during marshal: <recovered value>".
func marshalWithPanicRecovery(v any) (bs []byte, err error) {
	// The deferred closure runs while a panic unwinds this frame, so it can
	// overwrite the named results before the function returns.
	defer func() {
		if r := recover(); r != nil {
			bs = nil
			err = fmt.Errorf("panic during marshal: %v", r)
		}
	}()

	return json.Marshal(v)
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
package statemanager | ||
|
||
import ( | ||
"context" | ||
"os" | ||
"path/filepath" | ||
"testing" | ||
"time" | ||
|
||
"github.com/stretchr/testify/assert" | ||
"github.com/stretchr/testify/require" | ||
) | ||
|
||
// MockState implements the State interface for testing.
// It carries no fields; both methods return fixed values.
type MockState struct{}

// Name returns the constant identifier "mock_state".
func (m MockState) Name() string {
	return "mock_state"
}

// Cleanup performs no work and always returns nil.
func (m MockState) Cleanup() error {
	return nil
}
|
||
func TestManager_PersistState_SlowWrite(t *testing.T) { | ||
tmpDir := t.TempDir() | ||
|
||
tests := []struct { | ||
name string | ||
contextTimeout time.Duration | ||
expectError bool | ||
errorType error | ||
}{ | ||
{ | ||
name: "write completes before deadline", | ||
contextTimeout: 1 * time.Second, | ||
expectError: false, | ||
}, | ||
{ | ||
name: "write exceeds deadline", | ||
contextTimeout: 0, | ||
expectError: true, | ||
errorType: context.DeadlineExceeded, | ||
}, | ||
} | ||
|
||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
stateFile := filepath.Join(tmpDir, tt.name+"-state.json") | ||
|
||
file, err := os.Create(stateFile) | ||
require.NoError(t, err) | ||
defer file.Close() | ||
|
||
m := New(stateFile) | ||
|
||
// Register and update mock state | ||
mockState := &MockState{} | ||
m.RegisterState(mockState) | ||
err = m.UpdateState(mockState) | ||
require.NoError(t, err) | ||
|
||
// Create context with timeout | ||
ctx, cancel := context.WithTimeout(context.Background(), tt.contextTimeout) | ||
defer cancel() | ||
|
||
// Attempt to persist state | ||
err = m.PersistState(ctx) | ||
|
||
if tt.expectError { | ||
assert.Error(t, err) | ||
assert.Equal(t, context.DeadlineExceeded, err) | ||
assert.Len(t, m.dirty, 1) | ||
} else { | ||
assert.NoError(t, err) | ||
assert.FileExists(t, stateFile) | ||
assert.Empty(t, m.dirty) | ||
} | ||
}) | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not unlock m.mu here? In the worst case it blocks the update function calls for 5 sec. After this point the code does not touch m.states, so it makes no sense to keep protecting it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We still need to clean up the `dirty` map further down. If we unlock here we might clear `dirty` with new entries that haven't been written yet.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does the long-running code (5 sec) really not cause any issues outside of the state manager?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It can delay updating iptables/nftables/routes/DNS on network map updates. Because of routes, it might also delay p2p connections.