diff --git a/lxd/locking/lock.go b/lxd/locking/lock.go
index 0d2c4c9043ef..4fe9f0385984 100644
--- a/lxd/locking/lock.go
+++ b/lxd/locking/lock.go
@@ -69,3 +69,20 @@ func Lock(ctx context.Context, lockName string) (UnlockFunc, error) {
 		}
 	}
 }
+
+// TryLock attempts to acquire a lock without blocking. If it succeeds, it returns an unlock
+// function and true. If it fails to acquire the lock, it returns nil and false.
+func TryLock(lockName string) (UnlockFunc, bool) {
+	// Create and cancel a new context.
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+
+	// Call Lock with an already cancelled context.
+	// In case the lock cannot be acquired, return without blocking.
+	unlock, err := Lock(ctx, lockName)
+	if err != nil {
+		return nil, false
+	}
+
+	return unlock, true
+}
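The trick in TryLock deserves a closer look: Lock only consults the context while it is waiting for a held lock, so passing an already-cancelled context turns the blocking acquire into a non-blocking one. Below is a minimal, self-contained sketch of that pattern; the one-slot channel semaphore and the lock/tryLock names are illustrative stand-ins, not the locking package's actual internals.

```go
package main

import (
	"context"
	"fmt"
)

// sem is a one-slot semaphore standing in for the per-name lock map.
var sem = make(chan struct{}, 1)

// lock mimics the shape of locking.Lock: it acquires a free lock immediately
// and only consults the context while waiting for a held one.
func lock(ctx context.Context) (func(), error) {
	// Fast path: take a free lock without looking at the context.
	select {
	case sem <- struct{}{}:
		return func() { <-sem }, nil
	default:
	}

	// Slow path: wait for the lock to be released or the context to end.
	select {
	case sem <- struct{}{}:
		return func() { <-sem }, nil
	case <-ctx.Done():
		return nil, ctx.Err()
	}
}

// tryLock applies the same trick as TryLock in the diff: an already-cancelled
// context makes the blocking acquire fail immediately instead of queueing.
func tryLock() (func(), bool) {
	ctx, cancel := context.WithCancel(context.Background())
	cancel()

	unlock, err := lock(ctx)
	if err != nil {
		return nil, false
	}

	return unlock, true
}

func main() {
	unlock, ok := tryLock()
	fmt.Println("first attempt acquired:", ok) // true: lock was free

	_, ok = tryLock()
	fmt.Println("second attempt acquired:", ok) // false: lock already held

	unlock()
}
```

Note the ordering in the sketch: the free lock is taken before the context is checked, which keeps tryLock deterministic; if the acquire raced against ctx.Done() in a single select, a cancelled context could spuriously fail even on a free lock.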
diff --git a/lxd/storage/connectors/utils.go b/lxd/storage/connectors/utils.go
index 75ced56d43f4..510d41d0a9a3 100644
--- a/lxd/storage/connectors/utils.go
+++ b/lxd/storage/connectors/utils.go
@@ -40,9 +40,35 @@ func connect(ctx context.Context, c Connector, targetQN string, targetAddrs []st
 	// to race conditions if other connection attempts are still ongoing.
 	// For the same reason, relying on a higher-level lock from the caller
 	// (e.g., the storage driver) is insufficient.
-	unlock, err := locking.Lock(ctx, targetQN)
-	if err != nil {
-		return nil, err
+	unlock, ok := locking.TryLock(targetQN)
+	if !ok {
+		// The connectors lock is already acquired.
+		// This can happen in the following scenario:
+		// (1) Another caller is already in the process of connecting to the target(s).
+		// In this case we can rely on that connection to either succeed or fail, and simply return.
+		// If the connection attempt of the preceding caller fails, the volume
+		// discovery of the subsequent caller, which happens right after the connection, will run into a timeout (default 30s).
+		// Even if we tried to connect ourselves after the lock gets released, chances are high that this
+		// connection attempt would also fail.
+		// The only difference is that the subsequent caller who hasn't obtained the lock
+		// will see a different error than the one reported to the preceding caller.
+		// The preceding caller will likely get an error from the transport layer,
+		// whereas all of the subsequent callers will see errors indicating that the volume cannot be discovered (due to the non-existent connection).
+		//
+		// By default, the connection attempt's timeout is 30s. The volume discovery timeout is also set to 30s.
+		// This means if the preceding caller's connection attempt succeeds after 29s, it has another 30s to discover the actual volume.
+		// In contrast, any subsequent callers will only wait for up to 30s for the volume to appear.
+		// Therefore it's important not to set custom context deadlines for the context passed into the connection attempt and
+		// the one used for volume discovery.
+		// The timeout used for volume discovery should be at least the same length as the one for the connection attempt.
+		//
+		// There cannot be a situation in which the connectors lock is held for an ongoing disconnect.
+		// This is protected by the outer lock on the storage driver level which synchronizes both
+		// the map (connect) and unmap (disconnect) attempts.
+		// The connectors lock is never acquired for disconnects as those are either
+		// (1) protected by the lock acquired for making the connection
+		// (2) or synchronized by the higher-level storage driver lock.
+		return nil, nil
 	}
 
 	// Once the lock is obtained, search for an existing session.
@@ -55,7 +81,7 @@ func connect(ctx context.Context, c Connector, targetQN string, targetAddrs []st
 	// continue after the first successful connection (which causes the function
 	// to exit). The context is manually cancelled once all attempts complete.
 	var cancel context.CancelFunc
-	_, ok := ctx.Deadline()
+	_, ok = ctx.Deadline()
 	if !ok {
 		// Set a default timeout of 30 seconds for the context
 		// if no deadline is already configured.
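For illustration, here is a hedged sketch of the caller-side contract described in the comment above: a (nil, nil) result from connect means another caller is already connecting, so the caller falls through to volume discovery, whose timeout then bounds the wait. The mapVolume, connectFn and waitForVolume names are hypothetical and do not appear in this diff.

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// connectFn is a hypothetical stand-in for connectors.connect: a (nil, nil)
// result means another caller holds the per-target lock and is connecting
// on our behalf, so it must not be treated as a failure.
type connectFn func(ctx context.Context, targetQN string) (cleanup func(), err error)

// mapVolume sketches the caller-side flow: connect, then discover the volume.
// Volume discovery is the step that absorbs a lost TryLock race, so its
// timeout must be at least as long as the connection attempt's.
func mapVolume(ctx context.Context, connect connectFn, targetQN string) error {
	cleanup, err := connect(ctx, targetQN)
	if err != nil {
		return err
	}

	if cleanup != nil {
		defer cleanup()
	}

	// Use the default 30s discovery timeout rather than a shorter custom
	// deadline, per the reasoning in the diff's comment block.
	discoverCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
	defer cancel()

	return waitForVolume(discoverCtx, targetQN)
}

// waitForVolume is a hypothetical discovery helper; here it pretends the
// device appears after a short delay.
func waitForVolume(ctx context.Context, targetQN string) error {
	select {
	case <-time.After(10 * time.Millisecond):
		return nil
	case <-ctx.Done():
		return fmt.Errorf("volume for %q not discovered: %w", targetQN, ctx.Err())
	}
}

func main() {
	// Simulate losing the TryLock race: connect reports (nil, nil) and the
	// preceding caller's connection lets discovery succeed anyway.
	connect := func(ctx context.Context, targetQN string) (func(), error) {
		return nil, nil
	}

	err := mapVolume(context.Background(), connect, "iqn.2004-04.com.example:target1")
	fmt.Println("mapVolume:", err) // mapVolume: <nil>
}
```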