@@ -56,10 +56,11 @@ type LocalCluster struct {
5656 customTokenizers string
5757
5858 // resources
59- dcli * docker.Client
60- net cnet
61- zeros []* zero
62- alphas []* alpha
59+ dcli * docker.Client
60+ net cnet
61+ netMutex sync.Mutex // protects network recreation
62+ zeros []* zero
63+ alphas []* alpha
6364}
6465
6566// UpgradeStrategy is an Enum that defines various upgrade strategies
@@ -167,18 +168,42 @@ func (c *LocalCluster) init() error {
167168
168169func (c * LocalCluster ) createNetwork () error {
169170 c .net .name = c .conf .prefix + "-net"
171+
172+ ctx , cancel := context .WithTimeout (context .Background (), requestTimeout )
173+ defer cancel ()
174+
175+ // Check if network already exists
176+ existingNet , err := c .dcli .NetworkInspect (ctx , c .net .name , network.InspectOptions {})
177+ if err == nil {
178+ // Network exists, reuse it
179+ log .Printf ("[INFO] reusing existing network %s (ID: %s)" , c .net .name , existingNet .ID )
180+ c .net .id = existingNet .ID
181+ return nil
182+ }
183+
184+ // Network doesn't exist, create it
170185 opts := network.CreateOptions {
171186 Driver : "bridge" ,
172187 IPAM : & network.IPAM {Driver : "default" },
173188 }
174189
175- ctx , cancel := context .WithTimeout (context .Background (), requestTimeout )
176- defer cancel ()
177- network , err := c .dcli .NetworkCreate (ctx , c .net .name , opts )
190+ networkResp , err := c .dcli .NetworkCreate (ctx , c .net .name , opts )
178191 if err != nil {
192+ // If network already exists (race condition), try to inspect and reuse it
193+ if strings .Contains (err .Error (), "already exists" ) {
194+ log .Printf ("[INFO] network %s already exists (race condition), inspecting" , c .net .name )
195+ existingNet , inspectErr := c .dcli .NetworkInspect (ctx , c .net .name , network.InspectOptions {})
196+ if inspectErr == nil {
197+ log .Printf ("[INFO] reusing existing network %s (ID: %s)" , c .net .name , existingNet .ID )
198+ c .net .id = existingNet .ID
199+ return nil
200+ }
201+ // If inspect also fails, return original create error
202+ log .Printf ("[WARNING] failed to inspect network after creation conflict: %v" , inspectErr )
203+ }
179204 return errors .Wrap (err , "error creating network" )
180205 }
181- c .net .id = network .ID
206+ c .net .id = networkResp .ID
182207
183208 return nil
184209}
@@ -256,6 +281,27 @@ func (c *LocalCluster) createContainer(dc dnode) (string, error) {
256281 return "" , err
257282 }
258283
284+ // Verify the network still exists before creating container
285+ ctx , cancel := context .WithTimeout (context .Background (), requestTimeout )
286+ defer cancel ()
287+ if c .net .id != "" {
288+ _ , err := c .dcli .NetworkInspect (ctx , c .net .id , network.InspectOptions {})
289+ if err != nil {
290+ // Use mutex to prevent multiple goroutines from recreating network simultaneously
291+ c .netMutex .Lock ()
292+ // Double-check after acquiring lock - another goroutine may have recreated it
293+ _ , recheckErr := c .dcli .NetworkInspect (ctx , c .net .id , network.InspectOptions {})
294+ if recheckErr != nil {
295+ log .Printf ("[WARNING] network %s (ID: %s) not found, recreating" , c .net .name , c .net .id )
296+ if err := c .createNetwork (); err != nil {
297+ c .netMutex .Unlock ()
298+ return "" , errors .Wrap (err , "error recreating network" )
299+ }
300+ }
301+ c .netMutex .Unlock ()
302+ }
303+ }
304+
259305 cconf := & container.Config {Cmd : cmd , Image : image , WorkingDir : dc .workingDir (), ExposedPorts : dc .ports ()}
260306 hconf := & container.HostConfig {Mounts : mts , PublishAllPorts : true , PortBindings : dc .bindings (c .conf .portOffset )}
261307 networkConfig := & network.NetworkingConfig {
@@ -267,8 +313,6 @@ func (c *LocalCluster) createContainer(dc dnode) (string, error) {
267313 },
268314 }
269315
270- ctx , cancel := context .WithTimeout (context .Background (), requestTimeout )
271- defer cancel ()
272316 resp , err := c .dcli .ContainerCreate (ctx , cconf , hconf , networkConfig , nil , dc .cname ())
273317 if err != nil {
274318 return "" , errors .Wrapf (err , "error creating container %v" , dc .cname ())
@@ -394,16 +438,28 @@ func (c *LocalCluster) cleanupDocker() error {
394438 // Prune containers
395439 contsReport , err := c .dcli .ContainersPrune (ctx , filters.Args {})
396440 if err != nil {
397- log .Fatalf ("[ERROR] Error pruning containers: %v" , err )
441+ // Don't fail if prune is already running - just skip it
442+ if strings .Contains (err .Error (), "already running" ) {
443+ log .Printf ("[WARNING] Skipping container prune - operation already running" )
444+ } else {
445+ log .Printf ("[WARNING] Error pruning containers: %v" , err )
446+ }
447+ } else {
448+ log .Printf ("[INFO] Pruned containers: %+v\n " , contsReport )
398449 }
399- log .Printf ("[INFO] Pruned containers: %+v\n " , contsReport )
400450
401451 // Prune networks
402452 netsReport , err := c .dcli .NetworksPrune (ctx , filters.Args {})
403453 if err != nil {
404- log .Fatalf ("[ERROR] Error pruning networks: %v" , err )
454+ // Don't fail if prune is already running - just skip it
455+ if strings .Contains (err .Error (), "already running" ) {
456+ log .Printf ("[WARNING] Skipping network prune - operation already running" )
457+ } else {
458+ log .Printf ("[WARNING] Error pruning networks: %v" , err )
459+ }
460+ } else {
461+ log .Printf ("[INFO] Pruned networks: %+v\n " , netsReport )
405462 }
406- log .Printf ("[INFO] Pruned networks: %+v\n " , netsReport )
407463
408464 return nil
409465}
@@ -493,6 +549,22 @@ func (c *LocalCluster) StartAlpha(id int) error {
493549func (c * LocalCluster ) startContainer (dc dnode ) error {
494550 ctx , cancel := context .WithTimeout (context .Background (), requestTimeout )
495551 defer cancel ()
552+
553+ // verify the container still exists
554+ _ , err := c .dcli .ContainerInspect (ctx , dc .cid ())
555+ if err != nil {
556+ log .Printf ("[WARNING] container %s (ID: %s) not found, attempting to recreate" , dc .cname (), dc .cid ())
557+ newCID , createErr := c .createContainer (dc )
558+ if createErr != nil {
559+ return errors .Wrapf (createErr , "error recreating missing container [%v]" , dc .cname ())
560+ }
561+ switch node := dc .(type ) {
562+ case * alpha , * zero :
563+ node .setContainerID (newCID )
564+ }
565+ log .Printf ("[INFO] successfully recreated container %s with new ID: %s" , dc .cname (), newCID )
566+ }
567+
496568 if err := c .dcli .ContainerStart (ctx , dc .cid (), container.StartOptions {}); err != nil {
497569 return errors .Wrapf (err , "error starting container [%v]" , dc .cname ())
498570 }
@@ -634,15 +706,15 @@ func (c *LocalCluster) containerHealthCheck(url func(c *LocalCluster) (string, e
634706
635707 req , err := http .NewRequest (http .MethodGet , endpoint , nil )
636708 if err != nil {
637- if attempt > 10 {
638- log .Printf ("[WARNING] error building req for endpoint [%v], err: [%v]" , endpoint , err )
709+ if attempt > 50 {
710+ log .Printf ("[WARNING] problem building req for endpoint [%v], err: [%v]" , endpoint , err )
639711 }
640712 continue
641713 }
642714 body , err := dgraphapi .DoReq (req )
643715 if err != nil {
644- if attempt > 10 {
645- log .Printf ("[WARNING] error hitting health endpoint [%v], err: [%v]" , endpoint , err )
716+ if attempt > 50 {
717+ log .Printf ("[WARNING] problem hitting health endpoint [%v], err: [%v]" , endpoint , err )
646718 }
647719 continue
648720 }
@@ -691,8 +763,8 @@ func (c *LocalCluster) waitUntilLogin() error {
691763 log .Printf ("[INFO] login succeeded" )
692764 return nil
693765 }
694- if attempt > 10 {
695- log .Printf ("[WARNING] error trying to login: %v" , err )
766+ if attempt > 5 {
767+ log .Printf ("[WARNING] problem trying to login: %v" , err )
696768 }
697769 time .Sleep (waitDurBeforeRetry )
698770 }
@@ -876,7 +948,7 @@ func (c *LocalCluster) Client() (*dgraphapi.GrpcClient, func(), error) {
876948 cleanup := func () {
877949 for _ , conn := range conns {
878950 if err := conn .Close (); err != nil {
879- log .Printf ("[WARNING] error closing connection: %v" , err )
951+ log .Printf ("[WARNING] problem closing connection: %v" , err )
880952 }
881953 }
882954 }
@@ -897,7 +969,7 @@ func (c *LocalCluster) AlphaClient(id int) (*dgraphapi.GrpcClient, func(), error
897969 client := dgo .NewDgraphClient (api .NewDgraphClient (conn ))
898970 cleanup := func () {
899971 if err := conn .Close (); err != nil {
900- log .Printf ("[WARNING] error closing connection: %v" , err )
972+ log .Printf ("[WARNING] problem closing connection: %v" , err )
901973 }
902974 }
903975 return & dgraphapi.GrpcClient {Dgraph : client }, cleanup , nil
0 commit comments