From 088649e06c5bbc2f8ef5b9ae82c5b34d273c4138 Mon Sep 17 00:00:00 2001 From: Maxim Kim Date: Mon, 6 Apr 2026 13:42:36 -0700 Subject: [PATCH 1/2] Fix TlsHandler.FinishWrap null promise masking AuthenticationException Fix four interrelated bugs that cause NullReferenceException to mask the real AuthenticationException when TLS certificate validation fails: 1. Add catch clause in Decode's error handler so WrapAndFlush exceptions don't mask the original cause 2. Null-check Remove() return in Wrap to handle queue drained by re-entrant HandleFailure 3. Guard FinishWrap/FinishWrapNonAppData with _outboundClosed check to short-circuit writes after failure 4. Wrap _sslStream.Dispose() in Close with try-catch to prevent dispose exceptions from propagating Fixes https://github.com/maksimkim/SpanNetty/issues/60 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Tls/TlsHandler.Reader.cs | 14 +- .../Tls/TlsHandler.Writer.cs | 30 ++++ src/DotNetty.Handlers/Tls/TlsHandler.cs | 10 +- .../DotNetty.Handlers.Tests/TlsHandlerTest.cs | 136 ++++++++++++++++++ 4 files changed, 180 insertions(+), 10 deletions(-) diff --git a/src/DotNetty.Handlers/Tls/TlsHandler.Reader.cs b/src/DotNetty.Handlers/Tls/TlsHandler.Reader.cs index ad4c6b109..bfd1d91ac 100644 --- a/src/DotNetty.Handlers/Tls/TlsHandler.Reader.cs +++ b/src/DotNetty.Handlers/Tls/TlsHandler.Reader.cs @@ -180,15 +180,11 @@ protected override void Decode(IChannelHandlerContext context, IByteBuffer input // of the SSLException reported here. WrapAndFlush(context); } - // TODO revisit - //catch (IOException) - //{ - // if (s_logger.DebugEnabled) - // { - // s_logger.Debug("SSLException during trying to call SSLEngine.wrap(...)" + - // " because of an previous SSLException, ignoring...", ex); - // } - //} + catch (Exception) + { + // Swallow any exception from WrapAndFlush so it does not mask the original cause. + // See https://github.com/maksimkim/SpanNetty/issues/60 + } finally { HandleFailure(cause); diff --git a/src/DotNetty.Handlers/Tls/TlsHandler.Writer.cs b/src/DotNetty.Handlers/Tls/TlsHandler.Writer.cs index 8d40d88c0..f65ebc91e 100644 --- a/src/DotNetty.Handlers/Tls/TlsHandler.Writer.cs +++ b/src/DotNetty.Handlers/Tls/TlsHandler.Writer.cs @@ -150,6 +150,12 @@ private void Wrap(IChannelHandlerContext context) buf = null; var promise = _pendingUnencryptedWrites.Remove(); + if (promise is null) + { + // Queue was drained externally (e.g., by re-entrant HandleFailure → RemoveAndFailAll). + // See https://github.com/maksimkim/SpanNetty/issues/60 + break; + } Task task = _lastContextWriteTask; if (task is object) { @@ -172,6 +178,12 @@ private void Wrap(IChannelHandlerContext context) #if NETCOREAPP || NETSTANDARD_2_0_GREATER private void FinishWrap(in ReadOnlySpan buffer, IPromise promise) { + if (_outboundClosed) + { + _ = promise.TryComplete(); + return; + } + IByteBuffer output; var capturedContext = CapturedContext; if (buffer.IsEmpty) @@ -192,6 +204,12 @@ private void FinishWrap(in ReadOnlySpan buffer, IPromise promise) private void FinishWrap(byte[] buffer, int offset, int count, IPromise promise) { + if (_outboundClosed) + { + _ = promise.TryComplete(); + return; + } + IByteBuffer output; var capturedContext = CapturedContext; if (0u >= (uint)count) @@ -210,6 +228,12 @@ private void FinishWrap(byte[] buffer, int offset, int count, IPromise promise) #if NETCOREAPP || NETSTANDARD_2_0_GREATER private Task FinishWrapNonAppDataAsync(in ReadOnlyMemory buffer, IPromise promise) { + if (_outboundClosed) + { + _ = promise.TryComplete(); + return TaskUtil.Completed; + } + var capturedContext = CapturedContext; Task future; if (MemoryMarshal.TryGetArray(buffer, out var seg)) @@ -227,6 +251,12 @@ private Task FinishWrapNonAppDataAsync(in ReadOnlyMemory buffer, IPromise private Task FinishWrapNonAppDataAsync(byte[] buffer, int offset, int count, IPromise promise) { + if (_outboundClosed) + { + _ = promise.TryComplete(); + return TaskUtil.Completed; + } + var capturedContext = CapturedContext; var future = capturedContext.WriteAndFlushAsync(Unpooled.WrappedBuffer(buffer, offset, count), promise); this.ReadIfNeeded(capturedContext); diff --git a/src/DotNetty.Handlers/Tls/TlsHandler.cs b/src/DotNetty.Handlers/Tls/TlsHandler.cs index db037c3a0..6fde92e79 100644 --- a/src/DotNetty.Handlers/Tls/TlsHandler.cs +++ b/src/DotNetty.Handlers/Tls/TlsHandler.cs @@ -228,7 +228,15 @@ public override void Close(IChannelHandlerContext context, IPromise promise) { //CloseOutboundAndChannel(context, promise, false); _ = _closeFuture.TryComplete(); - _sslStream.Dispose(); + try + { + _sslStream.Dispose(); + } + catch (Exception) + { + // Swallow dispose exceptions to prevent them from propagating during channel close. + // See https://github.com/maksimkim/SpanNetty/issues/60 + } base.Close(context, promise); } diff --git a/test/DotNetty.Handlers.Tests/TlsHandlerTest.cs b/test/DotNetty.Handlers.Tests/TlsHandlerTest.cs index 71da5f07a..15b6b570e 100644 --- a/test/DotNetty.Handlers.Tests/TlsHandlerTest.cs +++ b/test/DotNetty.Handlers.Tests/TlsHandlerTest.cs @@ -6,6 +6,7 @@ namespace DotNetty.Handlers.Tests using System; using System.Collections.Generic; using System.Diagnostics; + using System.IO; using System.Linq; using System.Net.Security; using System.Runtime.InteropServices; @@ -378,5 +379,140 @@ public override void ChannelActive(IChannelHandlerContext context) } } } + + /// + /// Regression test for https://github.com/maksimkim/SpanNetty/issues/60 + /// Verifies that after the pending write queue is drained (as happens during + /// re-entrant HandleFailure → RemoveAndFailAll in production), the Wrap/Flush + /// code path handles the empty queue gracefully without crashing. + /// + /// Note: The exact re-entrant NRE scenario from issue #60 cannot be reliably + /// reproduced in a unit test because the NRE occurs inside a ContinueWith + /// continuation (silently swallowed by Task) and SslStream.Dispose() during + /// HandleFailure prevents reaching the Remove() call. This test verifies the + /// defensive fix paths are exercised correctly. + /// + [Fact] + public async Task WrapRemoveNull_ShouldNotThrowNullReferenceException() + { + var executor = new DefaultEventExecutor(); + try + { + var writeTasks = new List(); + var writeStrategy = new AsIsWriteStrategy(); + + X509Certificate2 tlsCertificate = TestResourceHelper.GetTestCertificate(); + string targetHost = tlsCertificate.GetNameInfo(X509NameType.DnsName, false); + + TlsHandler tlsHandler = new TlsHandler( + stream => new SslStream(stream, true, (sender, certificate, chain, errors) => true), + new ClientTlsSettings(SslProtocols.Tls12, false, new List(), targetHost)); + + var ch = new EmbeddedChannel(tlsHandler); + + // -- Complete the TLS handshake -- + IByteBuffer readResultBuffer = Unpooled.Buffer(4 * 1024); + Func, Task> readDataFunc = async output => + { + if (writeTasks.Count > 0) + { + await Task.WhenAll(writeTasks).WithTimeout(TestTimeout); + writeTasks.Clear(); + } + + if (readResultBuffer.ReadableBytes < output.Count) + { + if (ch.IsActive) + { +#pragma warning disable CS1998 + await ReadOutboundAsync(async () => ch.ReadOutbound(), output.Count - readResultBuffer.ReadableBytes, readResultBuffer, TestTimeout, readResultBuffer.ReadableBytes != 0 ? 0 : 1); +#pragma warning restore CS1998 + } + } + int read = Math.Min(output.Count, readResultBuffer.ReadableBytes); + readResultBuffer.ReadBytes(output.Array, output.Offset, read); + return read; + }; + var mediationStream = new MediationStream(readDataFunc, input => + { + Task task = executor.SubmitAsync(() => writeStrategy.WriteToChannelAsync(ch, input)).Unwrap(); + writeTasks.Add(task); + return task; + }, () => + { + ch.CloseAsync(); + }); + + var driverStream = new SslStream(mediationStream, true, (_1, _2, _3, _4) => true); + await Task.Run(() => driverStream.AuthenticateAsServerAsync(tlsCertificate, false, SslProtocols.Tls12, false)) + .WithTimeout(TimeSpan.FromSeconds(10)); + writeTasks.Clear(); + + // -- Handshake complete. Now trigger the bug scenario. -- + // Step 1: Add a pending write (via Pipeline.WriteAsync, NOT WriteOutbound which also flushes) + ch.Pipeline.WriteAsync(Unpooled.WrappedBuffer(new byte[] { 1, 2, 3 })); + + // Step 2: Drain the queue (simulating re-entrant HandleFailure → RemoveAndFailAll) + var queueField = typeof(TlsHandler).GetField("_pendingUnencryptedWrites", + System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance); + var queue = (BatchingPendingWriteQueue)queueField.GetValue(tlsHandler); + Assert.False(queue.IsEmpty, "Queue should have a pending write"); + queue.RemoveAndFailAll(new IOException("simulated connection failure")); + Assert.True(queue.IsEmpty, "Queue should be empty after RemoveAndFailAll"); + + // Step 3: Flush → WrapAndFlush → Wrap → Current returns null → loop exits. + // Before fix: If Remove() was called after Current returned non-null + // (due to re-entrant drain between the two calls), NRE would occur. + // After fix: Remove() null is guarded with a break. + // + // In this simplified test, Current returns null because we drained + // the queue before Flush. This exercises the "queue is empty" path + // and verifies the handler doesn't crash. + try + { + ch.Flush(); + } + catch (Exception ex) + { + Assert.False( + ContainsNullReferenceException(ex), + $"NRE from Wrap should not occur: {ex}"); + } + + // Also verify Remove() returns null on an empty queue (the precondition for Fix 2) + Assert.Null(queue.Remove()); + + try + { + ch.CheckException(); + } + catch (Exception ex) + { + Assert.False( + ContainsNullReferenceException(ex), + $"NRE stored in channel: {ex}"); + } + + driverStream.Dispose(); + } + finally + { + await executor.ShutdownGracefullyAsync(TimeSpan.Zero, TimeSpan.Zero); + } + } + + static bool ContainsNullReferenceException(Exception ex) + { + if (ex is NullReferenceException) return true; + if (ex is AggregateException agg) + { + foreach (var inner in agg.Flatten().InnerExceptions) + { + if (inner is NullReferenceException) return true; + } + } + return ex.InnerException is object && ContainsNullReferenceException(ex.InnerException); + } + } } From ef9afd41901fdbe841fbcc98b87d7ee14195e501 Mon Sep 17 00:00:00 2001 From: Maxim Kim Date: Tue, 7 Apr 2026 23:36:25 -0500 Subject: [PATCH 2/2] Improve regression test to fail without Fix 2 Use a custom Stream wrapper (QueueDrainingStreamWrapper) around MediationStream to simulate the re-entrant HandleFailure scenario: after SslStream encrypts data and writes ciphertext to MediationStream (which sets _lastContextWriteTask via FinishWrap), the wrapper drains the pending write queue and clears _lastContextWriteTask. When Wrap continues, Remove() returns null and the unfixed code hits promise.TryComplete() on a null promise, producing NRE. Verified: test FAILS on all 3 TFMs (net452, net471, netcoreapp2.1) without Fix 2, and PASSES with Fix 2 applied. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../DotNetty.Handlers.Tests/TlsHandlerTest.cs | 181 ++++++++++++++---- 1 file changed, 143 insertions(+), 38 deletions(-) diff --git a/test/DotNetty.Handlers.Tests/TlsHandlerTest.cs b/test/DotNetty.Handlers.Tests/TlsHandlerTest.cs index 15b6b570e..e8c97b942 100644 --- a/test/DotNetty.Handlers.Tests/TlsHandlerTest.cs +++ b/test/DotNetty.Handlers.Tests/TlsHandlerTest.cs @@ -382,15 +382,16 @@ public override void ChannelActive(IChannelHandlerContext context) /// /// Regression test for https://github.com/maksimkim/SpanNetty/issues/60 - /// Verifies that after the pending write queue is drained (as happens during - /// re-entrant HandleFailure → RemoveAndFailAll in production), the Wrap/Flush - /// code path handles the empty queue gracefully without crashing. + /// Verifies that when the pending write queue is drained re-entrantly during + /// Wrap (between the Current check and Remove call), the null return from + /// Remove() is handled gracefully instead of throwing NullReferenceException. /// - /// Note: The exact re-entrant NRE scenario from issue #60 cannot be reliably - /// reproduced in a unit test because the NRE occurs inside a ContinueWith - /// continuation (silently swallowed by Task) and SslStream.Dispose() during - /// HandleFailure prevents reaching the Remove() call. This test verifies the - /// defensive fix paths are exercised correctly. + /// Uses a custom Stream wrapper around MediationStream to simulate the + /// re-entrant HandleFailure → RemoveAndFailAll scenario: after SslStream + /// encrypts data and writes ciphertext to MediationStream (which sets + /// _lastContextWriteTask via FinishWrap), the wrapper drains the queue and + /// clears _lastContextWriteTask. When Wrap continues, Remove() returns null + /// and the unfixed code hits promise.TryComplete() on a null promise → NRE. /// [Fact] public async Task WrapRemoveNull_ShouldNotThrowNullReferenceException() @@ -404,10 +405,28 @@ public async Task WrapRemoveNull_ShouldNotThrowNullReferenceException() X509Certificate2 tlsCertificate = TestResourceHelper.GetTestCertificate(); string targetHost = tlsCertificate.GetNameInfo(X509NameType.DnsName, false); + // Reflection fields for the re-entrant drain simulation + var queueField = typeof(TlsHandler).GetField("_pendingUnencryptedWrites", + System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance); + var lastTaskField = typeof(TlsHandler).GetField("_lastContextWriteTask", + System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance); + + // Create a TlsHandler with a custom SslStream that wraps MediationStream + // in a QueueDrainingStreamWrapper. The wrapper intercepts SslStream's + // ciphertext writes and, when enabled, drains the pending write queue + // to simulate re-entrant HandleFailure. + QueueDrainingStreamWrapper streamWrapper = null; TlsHandler tlsHandler = new TlsHandler( - stream => new SslStream(stream, true, (sender, certificate, chain, errors) => true), + stream => + { + streamWrapper = new QueueDrainingStreamWrapper(stream); + return new SslStream(streamWrapper, true, (sender, certificate, chain, errors) => true); + }, new ClientTlsSettings(SslProtocols.Tls12, false, new List(), targetHost)); + // Wire up the reflection targets so the wrapper can drain the queue + streamWrapper.SetTarget(tlsHandler, queueField, lastTaskField); + var ch = new EmbeddedChannel(tlsHandler); // -- Complete the TLS handshake -- @@ -419,7 +438,6 @@ public async Task WrapRemoveNull_ShouldNotThrowNullReferenceException() await Task.WhenAll(writeTasks).WithTimeout(TestTimeout); writeTasks.Clear(); } - if (readResultBuffer.ReadableBytes < output.Count) { if (ch.IsActive) @@ -438,50 +456,34 @@ public async Task WrapRemoveNull_ShouldNotThrowNullReferenceException() Task task = executor.SubmitAsync(() => writeStrategy.WriteToChannelAsync(ch, input)).Unwrap(); writeTasks.Add(task); return task; - }, () => - { - ch.CloseAsync(); - }); + }, () => { ch.CloseAsync(); }); var driverStream = new SslStream(mediationStream, true, (_1, _2, _3, _4) => true); await Task.Run(() => driverStream.AuthenticateAsServerAsync(tlsCertificate, false, SslProtocols.Tls12, false)) .WithTimeout(TimeSpan.FromSeconds(10)); writeTasks.Clear(); - // -- Handshake complete. Now trigger the bug scenario. -- - // Step 1: Add a pending write (via Pipeline.WriteAsync, NOT WriteOutbound which also flushes) - ch.Pipeline.WriteAsync(Unpooled.WrappedBuffer(new byte[] { 1, 2, 3 })); + // -- Handshake complete. Enable the re-entrant drain simulation. -- + streamWrapper.ShouldDrain = true; - // Step 2: Drain the queue (simulating re-entrant HandleFailure → RemoveAndFailAll) - var queueField = typeof(TlsHandler).GetField("_pendingUnencryptedWrites", - System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance); - var queue = (BatchingPendingWriteQueue)queueField.GetValue(tlsHandler); - Assert.False(queue.IsEmpty, "Queue should have a pending write"); - queue.RemoveAndFailAll(new IOException("simulated connection failure")); - Assert.True(queue.IsEmpty, "Queue should be empty after RemoveAndFailAll"); - - // Step 3: Flush → WrapAndFlush → Wrap → Current returns null → loop exits. - // Before fix: If Remove() was called after Current returned non-null - // (due to re-entrant drain between the two calls), NRE would occur. - // After fix: Remove() null is guarded with a break. - // - // In this simplified test, Current returns null because we drained - // the queue before Flush. This exercises the "queue is empty" path - // and verifies the handler doesn't crash. + // Write + Flush triggers: TlsHandler.Write (adds to queue) → + // TlsHandler.Flush → WrapAndFlush → Wrap → buf.ReadBytes(_sslStream, ...) → + // SslStream encrypts → wrapper.Write → MediationStream.Write (FinishWrap sets + // _lastContextWriteTask) → wrapper drains queue & clears _lastContextWriteTask → + // back in Wrap: Remove() returns null, _lastContextWriteTask is null → + // Without fix: promise.TryComplete() where promise is null → NRE + // With fix: if (promise is null) { break; } → exits gracefully try { - ch.Flush(); + ch.WriteOutbound(Unpooled.WrappedBuffer(new byte[] { 1, 2, 3 })); } catch (Exception ex) { Assert.False( ContainsNullReferenceException(ex), - $"NRE from Wrap should not occur: {ex}"); + $"NRE from Wrap.Remove() should not occur: {ex}"); } - // Also verify Remove() returns null on an empty queue (the precondition for Fix 2) - Assert.Null(queue.Remove()); - try { ch.CheckException(); @@ -493,6 +495,9 @@ await Task.Run(() => driverStream.AuthenticateAsServerAsync(tlsCertificate, fals $"NRE stored in channel: {ex}"); } + Assert.True(streamWrapper.WasDrained, + "The queue should have been drained during the write"); + driverStream.Dispose(); } finally @@ -514,5 +519,105 @@ static bool ContainsNullReferenceException(Exception ex) return ex.InnerException is object && ContainsNullReferenceException(ex.InnerException); } + /// + /// Wraps MediationStream to simulate re-entrant queue drain during SslStream write. + /// After forwarding the encrypted write to MediationStream (which calls FinishWrap + /// and sets _lastContextWriteTask), it drains the pending write queue and clears + /// _lastContextWriteTask — reproducing the effect of HandleFailure being called + /// re-entrantly during an outbound write. + /// + sealed class QueueDrainingStreamWrapper : Stream + { + readonly Stream _inner; + object _handler; + System.Reflection.FieldInfo _queueField; + System.Reflection.FieldInfo _lastTaskField; + bool _drained; + + public bool ShouldDrain { get; set; } + public bool WasDrained => _drained; + + public QueueDrainingStreamWrapper(Stream inner) { _inner = inner; } + + public void SetTarget(object handler, System.Reflection.FieldInfo queueField, System.Reflection.FieldInfo lastTaskField) + { + _handler = handler; + _queueField = queueField; + _lastTaskField = lastTaskField; + } + + public override void Write(byte[] buffer, int offset, int count) + { + _inner.Write(buffer, offset, count); + DrainIfNeeded(); + } + + private void DrainIfNeeded() + { + if (ShouldDrain && !_drained) + { + _drained = true; + // Clear _lastContextWriteTask so Remove()'s null hits the else branch + // (promise.TryComplete()) instead of the ContinueWith path in LinkOutcome + _lastTaskField.SetValue(_handler, null); + // Drain the queue to make Remove() return null + var queue = (BatchingPendingWriteQueue)_queueField.GetValue(_handler); + queue.RemoveAndFailAll(new IOException("simulated connection failure")); + } + } + + // Required Stream overrides (forward to inner) + public override void Flush() => _inner.Flush(); + public override int Read(byte[] buffer, int offset, int count) => _inner.Read(buffer, offset, count); + public override Task ReadAsync(byte[] buffer, int offset, int count, System.Threading.CancellationToken cancellationToken) => _inner.ReadAsync(buffer, offset, count, cancellationToken); + public override long Seek(long offset, SeekOrigin origin) => _inner.Seek(offset, origin); + public override void SetLength(long value) => _inner.SetLength(value); + public override bool CanRead => _inner.CanRead; + public override bool CanSeek => _inner.CanSeek; + public override bool CanWrite => _inner.CanWrite; + public override long Length => _inner.Length; + public override long Position { get => _inner.Position; set => _inner.Position = value; } + +#if NETCOREAPP || NETSTANDARD_2_0_GREATER + public override System.Threading.Tasks.ValueTask ReadAsync(System.Memory buffer, System.Threading.CancellationToken cancellationToken = default) + => _inner.ReadAsync(buffer, cancellationToken); + + public override void Write(System.ReadOnlySpan buffer) + { + _inner.Write(buffer); + DrainIfNeeded(); + } + + public override System.Threading.Tasks.ValueTask WriteAsync(System.ReadOnlyMemory buffer, System.Threading.CancellationToken cancellationToken = default) + { + var result = _inner.WriteAsync(buffer, cancellationToken); + DrainIfNeeded(); + return result; + } +#endif + + public override Task WriteAsync(byte[] buffer, int offset, int count, System.Threading.CancellationToken cancellationToken) + { + var task = _inner.WriteAsync(buffer, offset, count, cancellationToken); + DrainIfNeeded(); + return task; + } + +#if !NETCOREAPP1_1 + public override IAsyncResult BeginRead(byte[] buffer, int offset, int count, AsyncCallback callback, object state) => _inner.BeginRead(buffer, offset, count, callback, state); + public override int EndRead(IAsyncResult asyncResult) => _inner.EndRead(asyncResult); + public override IAsyncResult BeginWrite(byte[] buffer, int offset, int count, AsyncCallback callback, object state) + { + // On .NET Framework, SslStream.Write uses BeginWrite/EndWrite internally + var result = _inner.BeginWrite(buffer, offset, count, callback, state); + DrainIfNeeded(); + return result; + } + public override void EndWrite(IAsyncResult asyncResult) => _inner.EndWrite(asyncResult); +#endif + + protected override void Dispose(bool disposing) { if (disposing) _inner.Dispose(); base.Dispose(disposing); } + } + } }