NVIDIA-NeMo · przemekboruta · Mar 25, 2026 · Mar 25, 2026 · Mar 25, 2026 · greptile-apps
@@ -83,6 +83,11 @@ def __init__(
     def concurrency_mode(self) -> ClientConcurrencyMode:
         return self._mode
 
+    @property
+    def limits(self) -> httpx.Limits:
+        """Pool limits built from ``max_parallel_requests`` at construction; passed to the inner HTTP transport."""
+        return self._limits
+
     @abstractmethod
     def _build_headers(self, extra_headers: dict[str, str]) -> dict[str, str]:
         """Build provider-specific request headers."""
@@ -97,10 +102,12 @@ def _get_sync_client(self) -> httpx.Client:
                 raise RuntimeError("Model client is closed.")
             if self._client is None:
                 if self._transport is None:
-                    self._transport = create_retry_transport(self._retry_config, strip_rate_limit_codes=False)
+                    inner = lazy.httpx.HTTPTransport(limits=self._limits)
+                    self._transport = create_retry_transport(
+                        self._retry_config, strip_rate_limit_codes=False, transport=inner
+                    )
                 self._client = lazy.httpx.Client(
                     transport=self._transport,
-                    limits=self._limits,
                     timeout=lazy.httpx.Timeout(self._timeout_s),
                 )
             return self._client
@@ -113,10 +120,12 @@ def _get_async_client(self) -> httpx.AsyncClient:
                 raise RuntimeError("Model client is closed.")
             if self._aclient is None:
                 if self._transport is None:
-                    self._transport = create_retry_transport(self._retry_config, strip_rate_limit_codes=True)
+                    inner = lazy.httpx.AsyncHTTPTransport(limits=self._limits)
+                    self._transport = create_retry_transport(
+                        self._retry_config, strip_rate_limit_codes=True, transport=inner
+                    )
                 self._aclient = lazy.httpx.AsyncClient(
                     transport=self._transport,
-                    limits=self._limits,
                     timeout=lazy.httpx.Timeout(self._timeout_s),
                 )
             return self._aclient

@@ -5,9 +5,13 @@
 
 import logging
 from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
 
 from httpx_retries import Retry, RetryTransport
 
+if TYPE_CHECKING:
+    import httpx
+
 logger = logging.getLogger(__name__)
 
 # 429 must not be retried at the transport layer so that rate-limit signals
@@ -37,6 +41,7 @@ def create_retry_transport(
     config: RetryConfig | None = None,
     *,
     strip_rate_limit_codes: bool = True,
+    transport: httpx.BaseTransport | httpx.AsyncBaseTransport | None = None,
 ) -> RetryTransport:
     """Build an httpx ``RetryTransport`` from a :class:`RetryConfig`.
 
@@ -51,6 +56,12 @@ def create_retry_transport(
             AIMD feedback loop.  When ``False`` (used by the sync engine, which has
             no salvage queue), 429 is kept in the retry list so the transport layer
             retries it transparently.
+        transport: Optional pre-configured transport to pass directly to
+            ``RetryTransport``.  Pass ``httpx.HTTPTransport`` for sync clients or
+            ``httpx.AsyncHTTPTransport`` for async clients — typically with a custom
+            ``limits=`` — so that the connection pool is sized correctly.  When
+            ``None`` (default), ``RetryTransport`` creates its own default pools for
+            both sync and async requests.
     """
     cfg = config or RetryConfig()
     status_codes = cfg.retryable_status_codes
@@ -72,4 +83,4 @@ def create_retry_transport(
         respect_retry_after_header=True,
         allowed_methods=Retry.RETRYABLE_METHODS | frozenset(["POST"]),
     )
-    return RetryTransport(retry=retry)
+    return RetryTransport(transport=transport, retry=retry)
@@ -289,3 +289,23 @@ async def test_acompletion_lazy_initializes_async_client(
 
     mock_ctor.assert_called_once()
     assert result.message.content == "lazy result"
+
+
+# ---------------------------------------------------------------------------
+# Connection pool size regression tests (issue #459)
+# ---------------------------------------------------------------------------
+
+
+def test_client_limits_respect_max_parallel_requests() -> None:
+    """Connection pool limits must reflect max_parallel_requests (regression for issue #459).
+
+    pool_max = max(32, 2 * max_parallel_requests) = max(32, 600) = 600
+    """
+    client = OpenAICompatibleClient(
+        provider_name=_OPENAI_PROVIDER,
+        endpoint=_OPENAI_ENDPOINT,
+        api_key="sk-test",
+        max_parallel_requests=300,
+        concurrency_mode=ClientConcurrencyMode.SYNC,
+    )
+    assert client.limits.max_connections == 600