Skip to content

Commit a497787

Browse files
committed
fix(core): serialize stderr/process monitoring lifecycle (MCP-770)
StartStderrMonitoring (from connectStdio during a reconcile-driven Connect) and StopStderrMonitoring (from Disconnect during Manager.ShutdownAll) accessed the monitoring lifecycle fields — stderrMonitoringCtx/Cancel/WaitGroup and the process-monitoring equivalents — with no synchronization. When a connect and a shutdown overlap on the same client, the -race detector flags WG.Add (Start) vs WG.Wait (Stop). CI caught this on PR #555's ubuntu unit job once the earlier MCP-770 fixes (atomic Config) + the Windows temp-dir cleanup made the test's shutdown path actually run concurrently with reconcile. Add a dedicated monitoringMu that makes the four Start*/Stop*Monitoring methods mutually exclusive, so Add never overlaps Wait. The mutex is a leaf (monitor goroutines never acquire it; it is never held across c.mu), so Stop's bounded WG.Wait under the lock cannot deadlock. Add TestStderrMonitoring_StartStopRace: hammers Start/Stop concurrently; trips -race on the unsynchronized fields, green with monitoringMu. Related: MCP-770, spec 070
1 parent 59f8630 commit a497787

3 files changed

Lines changed: 68 additions & 0 deletions

File tree

internal/upstream/core/client.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,14 @@ type Client struct {
7575
// Cached tools list from successful immediate call
7676
cachedTools []mcp.Tool
7777

78+
// monitoringMu serializes the stderr/process monitoring lifecycle methods
79+
// (Start*/Stop*Monitoring). Connect (StartStderrMonitoring) and Disconnect
80+
// (StopStderrMonitoring) can run concurrently on the same client during a
81+
// reconcile-vs-shutdown overlap, racing the ctx/cancel/WaitGroup fields
82+
// below (notably WG.Add vs WG.Wait). This mutex makes start and stop
83+
// mutually exclusive. It is never held across c.mu.
84+
monitoringMu sync.Mutex
85+
7886
// Stderr monitoring
7987
stderrMonitoringCtx context.Context
8088
stderrMonitoringCancel context.CancelFunc

internal/upstream/core/monitoring.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ const (
2424

2525
// StartStderrMonitoring starts monitoring stderr output and logging it
2626
func (c *Client) StartStderrMonitoring() {
27+
c.monitoringMu.Lock()
28+
defer c.monitoringMu.Unlock()
29+
2730
if c.stderr == nil || c.transportType != transportStdio {
2831
return
2932
}
@@ -43,6 +46,9 @@ func (c *Client) StartStderrMonitoring() {
4346

4447
// StopStderrMonitoring stops stderr monitoring
4548
func (c *Client) StopStderrMonitoring() {
49+
c.monitoringMu.Lock()
50+
defer c.monitoringMu.Unlock()
51+
4652
if c.stderrMonitoringCancel != nil {
4753
c.stderrMonitoringCancel()
4854

@@ -66,6 +72,9 @@ func (c *Client) StopStderrMonitoring() {
6672

6773
// StartProcessMonitoring starts monitoring the underlying process
6874
func (c *Client) StartProcessMonitoring() {
75+
c.monitoringMu.Lock()
76+
defer c.monitoringMu.Unlock()
77+
6978
// Start monitoring even if processCmd is nil for Docker containers
7079
if c.processCmd == nil && !c.isDockerCommand {
7180
return
@@ -94,6 +103,9 @@ func (c *Client) StartProcessMonitoring() {
94103

95104
// StopProcessMonitoring stops process monitoring
96105
func (c *Client) StopProcessMonitoring() {
106+
c.monitoringMu.Lock()
107+
defer c.monitoringMu.Unlock()
108+
97109
if c.processMonitorCancel != nil {
98110
c.processMonitorCancel()
99111

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
package core
2+
3+
import (
4+
"strings"
5+
"sync"
6+
"testing"
7+
8+
"go.uber.org/zap"
9+
10+
"github.com/smart-mcp-proxy/mcpproxy-go/internal/config"
11+
)
12+
13+
// TestStderrMonitoring_StartStopRace reproduces the Connect-vs-Disconnect race
14+
// on the stderr-monitoring lifecycle fields (stderrMonitoringCtx/Cancel/WG).
15+
// StartStderrMonitoring runs from connectStdio during a reconcile-driven Connect
16+
// while StopStderrMonitoring runs from Disconnect during Manager.ShutdownAll, with
17+
// no synchronization on those fields — the -race detector flags WG.Add (Start)
18+
// vs WG.Wait (Stop). Run under `go test -race`: trips without monitoringMu, green
19+
// with it. A reused empty stderr reader returns EOF immediately so monitorStderr
20+
// exits at once and the loop stays fast.
21+
func TestStderrMonitoring_StartStopRace(t *testing.T) {
22+
c := &Client{
23+
transportType: transportStdio,
24+
stderr: strings.NewReader(""),
25+
logger: zap.NewNop(),
26+
config: &config.ServerConfig{Name: "race"},
27+
}
28+
29+
const iterations = 500
30+
var wg sync.WaitGroup
31+
wg.Add(2)
32+
33+
go func() {
34+
defer wg.Done()
35+
for i := 0; i < iterations; i++ {
36+
c.StartStderrMonitoring()
37+
}
38+
}()
39+
go func() {
40+
defer wg.Done()
41+
for i := 0; i < iterations; i++ {
42+
c.StopStderrMonitoring()
43+
}
44+
}()
45+
46+
wg.Wait()
47+
c.StopStderrMonitoring()
48+
}

0 commit comments

Comments
 (0)