2 changes: 0 additions & 2 deletions examples/configs/gs_content_safety/config/config.yml
@@ -18,5 +18,3 @@ rails:
enabled: True
chunk_size: 200
context_size: 50

streaming: True
3 changes: 0 additions & 3 deletions examples/configs/llm/hf_pipeline_dolly/config.yml
@@ -2,9 +2,6 @@ models:
- type: main
engine: hf_pipeline_dolly

# Remove attribute / set to False if streaming is not required
streaming: True

instructions:
- type: general
content: |
2 changes: 0 additions & 2 deletions examples/configs/streaming/config.yml
@@ -13,5 +13,3 @@ rails:
dialog:
single_call:
enabled: True

streaming: True
24 changes: 3 additions & 21 deletions examples/notebooks/generate_events_and_streaming.ipynb
@@ -41,15 +41,11 @@
"metadata": {
"collapsed": false
},
"source": [
"## Step 1: create a config \n",
"\n",
"Let's create a simple config:"
]
"source": "## Step 1: create a config \n\nLet's create a simple config. No special streaming configuration is needed—streaming is automatically enabled when a `StreamingHandler` is used:"
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "d9bac50b3383915e",
"metadata": {
"ExecuteTime": {
@@ -59,21 +55,7 @@
"collapsed": false
},
"outputs": [],
"source": [
"from nemoguardrails import LLMRails, RailsConfig\n",
"\n",
"YAML_CONFIG = \"\"\"\n",
"models:\n",
" - type: main\n",
" engine: openai\n",
" model: gpt-4\n",
"\n",
"streaming: True\n",
"\"\"\"\n",
"\n",
"config = RailsConfig.from_content(yaml_content=YAML_CONFIG)\n",
"app = LLMRails(config)"
]
"source": "from nemoguardrails import LLMRails, RailsConfig\n\nYAML_CONFIG = \"\"\"\nmodels:\n - type: main\n engine: openai\n model: gpt-4\n\"\"\"\n\nconfig = RailsConfig.from_content(yaml_content=YAML_CONFIG)\napp = LLMRails(config)"
},
{
"cell_type": "markdown",
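As an editorial aside (not part of the notebook diff): a minimal sketch of how the same config is consumed under the new approach, where calling stream_async() is what enables streaming rather than a YAML flag. The user message is illustrative and an OpenAI API key is assumed to be available in the environment.

import asyncio

from nemoguardrails import LLMRails, RailsConfig

YAML_CONFIG = """
models:
  - type: main
    engine: openai
    model: gpt-4
"""


async def demo():
    config = RailsConfig.from_content(yaml_content=YAML_CONFIG)
    app = LLMRails(config)
    # Tokens are printed as they arrive; no `streaming: True` entry is needed in the YAML.
    async for chunk in app.stream_async(
        messages=[{"role": "user", "content": "Tell me a short joke."}]
    ):
        print(chunk, end="", flush=True)


asyncio.run(demo())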
4 changes: 0 additions & 4 deletions examples/scripts/demo_streaming.py
@@ -34,8 +34,6 @@
- type: main
engine: openai
model: gpt-4

streaming: True
"""


@@ -99,8 +97,6 @@ async def demo_streaming_from_custom_action():
dialog:
user_messages:
embeddings_only: True

streaming: True
""",
colang_content="""
# We need to have at least one canonical form to enable dialog rails.
48 changes: 29 additions & 19 deletions nemoguardrails/cli/chat.py
@@ -28,6 +28,7 @@
from nemoguardrails.colang.v2_x.runtime.eval import eval_expression
from nemoguardrails.colang.v2_x.runtime.flows import State
from nemoguardrails.colang.v2_x.runtime.runtime import RuntimeV2_x
from nemoguardrails.exceptions import StreamingNotSupportedError
from nemoguardrails.logging import verbose
from nemoguardrails.logging.verbose import console
from nemoguardrails.rails.llm.options import (
@@ -65,11 +66,6 @@ async def _run_chat_v1_0(
raise RuntimeError("config_path cannot be None when server_url is None")
rails_config = RailsConfig.from_path(config_path)
rails_app = LLMRails(rails_config, verbose=verbose)
if streaming and not rails_config.streaming_supported:
console.print(
f"WARNING: The config `{config_path}` does not support streaming. Falling back to normal mode."
)
streaming = False
else:
rails_app = None

@@ -83,19 +79,33 @@

if not server_url:
# If we have streaming from a locally loaded config, we initialize the handler.
if streaming and not server_url and rails_app and rails_app.main_llm_supports_streaming:
bot_message_list = []
async for chunk in rails_app.stream_async(messages=history):
if '{"event": "ABORT"' in chunk:
dict_chunk = json.loads(chunk)
console.print("\n\n[red]" + f"ABORT streaming. {dict_chunk['data']}" + "[/]")
break

console.print("[green]" + f"{chunk}" + "[/]", end="")
bot_message_list.append(chunk)

bot_message_text = "".join(bot_message_list)
bot_message = {"role": "assistant", "content": bot_message_text}
if streaming and not server_url and rails_app:
try:
bot_message_list = []
async for chunk in rails_app.stream_async(messages=history):
if '{"event": "ABORT"' in chunk:
dict_chunk = json.loads(chunk)
console.print("\n\n[red]" + f"ABORT streaming. {dict_chunk['data']}" + "[/]")
break

console.print("[green]" + f"{chunk}" + "[/]", end="")
bot_message_list.append(chunk)

bot_message_text = "".join(bot_message_list)
bot_message = {"role": "assistant", "content": bot_message_text}
except StreamingNotSupportedError as e:
raise StreamingNotSupportedError(
f"Cannot use --streaming with config `{config_path}` because output rails "
"are configured but streaming is not enabled for them.\n\n"
"To fix this, either:\n"
" 1. Enable streaming for output rails by adding to your config.yml:\n"
" rails:\n"
" output:\n"
" streaming:\n"
" enabled: True\n\n"
" 2. Or run without the --streaming flag:\n"
f" nemoguardrails chat {config_path}"
) from e

else:
if rails_app is None:
@@ -124,7 +134,7 @@ async def _run_chat_v1_0(
# String or other fallback case
bot_message = {"role": "assistant", "content": str(response)}

if not streaming or not rails_app.main_llm_supports_streaming:
if not streaming:
# We print bot messages in green.
content = bot_message.get("content", str(bot_message))
console.print("[green]" + f"{content}" + "[/]")
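For context, the remedy spelled out in the new error message corresponds to a config along the following lines. The `self check output` flow name and the model are illustrative (the rail's prompt definition is assumed to exist alongside the config), while the rails.output.streaming keys mirror the ones already present in the gs_content_safety example config above.

from nemoguardrails import RailsConfig

# Sketch of a config whose output rails are streaming-compatible, so that
# `nemoguardrails chat <config> --streaming` does not raise StreamingNotSupportedError.
YAML_CONFIG = """
models:
  - type: main
    engine: openai
    model: gpt-4

rails:
  output:
    flows:
      - self check output  # illustrative output rail; its prompt must be defined elsewhere
    streaming:
      enabled: True
      chunk_size: 200
      context_size: 50
"""

config = RailsConfig.from_content(yaml_content=YAML_CONFIG)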
7 changes: 7 additions & 0 deletions nemoguardrails/exceptions.py
@@ -19,6 +19,7 @@
"InvalidModelConfigurationError",
"InvalidRailsConfigurationError",
"LLMCallException",
"StreamingNotSupportedError",
]


@@ -49,6 +50,12 @@ class InvalidRailsConfigurationError(ConfigurationError):
pass


class StreamingNotSupportedError(InvalidRailsConfigurationError):
"""Raised when streaming is requested but not supported by the configuration."""

pass


class LLMCallException(Exception):
"""A wrapper around the LLM call invocation exception.

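Because StreamingNotSupportedError subclasses InvalidRailsConfigurationError, existing handlers for the broader type keep working. The sketch below (the config path and the shape of the non-streaming response are assumptions) shows one way a caller can catch the new exception and fall back to a regular call:

import asyncio

from nemoguardrails import LLMRails, RailsConfig
from nemoguardrails.exceptions import StreamingNotSupportedError


async def chat_once(app: LLMRails, user_message: str) -> None:
    messages = [{"role": "user", "content": user_message}]
    try:
        # stream_async() raises StreamingNotSupportedError when output rails are
        # configured but rails.output.streaming.enabled is not set.
        async for chunk in app.stream_async(messages=messages):
            print(chunk, end="", flush=True)
    except StreamingNotSupportedError:
        # Fall back to a non-streaming generation.
        bot_message = await app.generate_async(messages=messages)
        print(bot_message["content"])


app = LLMRails(RailsConfig.from_path("./config"))  # hypothetical config directory
asyncio.run(chat_once(app, "Hello!"))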
17 changes: 2 additions & 15 deletions nemoguardrails/rails/llm/config.py
@@ -1375,7 +1375,8 @@ class RailsConfig(BaseModel):

streaming: bool = Field(
default=False,
description="Whether this configuration should use streaming mode or not.",
deprecated="The 'streaming' field is no longer required. Use stream_async() method directly instead. This field will be removed in a future version.",
description="DEPRECATED: Use stream_async() method instead. This field is ignored.",
)

enable_rails_exceptions: bool = Field(
@@ -1665,20 +1666,6 @@ def parse_object(cls, obj):

return cls.parse_obj(obj)

@property
def streaming_supported(self):
"""Whether the current config supports streaming or not."""

if len(self.rails.output.flows) > 0:
# if we have output rails streaming enabled
# we keep it in case it was needed when we have
# support per rails
if self.rails.output.streaming and self.rails.output.streaming.enabled:
return True
return False

return True

def __add__(self, other):
"""Adds two RailsConfig objects."""
return _join_rails_configs(self, other)
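Downstream code that relied on the removed streaming_supported property can reproduce the same check with a small helper; this is a sketch based directly on the deleted body above:

from nemoguardrails import RailsConfig


def streaming_supported(config: RailsConfig) -> bool:
    """Standalone equivalent of the removed RailsConfig.streaming_supported property."""
    if len(config.rails.output.flows) > 0:
        # With output rails configured, streaming is only supported when it is
        # explicitly enabled for them.
        return bool(config.rails.output.streaming and config.rails.output.streaming.enabled)
    return True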
36 changes: 10 additions & 26 deletions nemoguardrails/rails/llm/llmrails.py
@@ -73,6 +73,7 @@
from nemoguardrails.exceptions import (
InvalidModelConfigurationError,
InvalidRailsConfigurationError,
StreamingNotSupportedError,
)
from nemoguardrails.kb.kb import KnowledgeBase
from nemoguardrails.llm.cache import CacheInterface, LFUCache
@@ -155,9 +156,6 @@ def __init__(
# should be removed
self.events_history_cache = {}

# Weather the main LLM supports streaming
self.main_llm_supports_streaming = False

# We also load the default flows from the `default_flows.yml` file in the current folder.
# But only for version 1.0.
# TODO: decide on the default flows for 2.x.
@@ -377,10 +375,9 @@ def _prepare_model_kwargs(self, model_config):
if api_key:
kwargs["api_key"] = api_key

# enable streaming token usage when streaming is enabled
# enable streaming token usage
# providers that don't support this parameter will simply ignore it
if self.config.streaming:
kwargs["stream_usage"] = True
kwargs["stream_usage"] = True
Review comment (author): @Pouyanpi: we can move this later to llm_call when we drop the streaming callback handler feature.

return kwargs

@@ -398,22 +395,9 @@ def _configure_main_llm_streaming(
provider_name (Optional[str], optional): Optional provider name for logging.

"""
if not self.config.streaming:
return

if hasattr(llm, "streaming"):
setattr(llm, "streaming", True)
self.main_llm_supports_streaming = True
else:
self.main_llm_supports_streaming = False
if model_name and provider_name:
log.warning(
"Model %s from provider %s does not support streaming.",
model_name,
provider_name,
)
else:
log.warning("Provided main LLM does not support streaming.")

def _init_llms(self):
"""
@@ -442,7 +426,6 @@ def _init_llms(self):
)
self.runtime.register_action_param("llm", self.llm)

self._configure_main_llm_streaming(self.llm)
else:
# Otherwise, initialize the main LLM from the config
main_model = next((model for model in self.config.models if model.type == "main"), None)
@@ -457,11 +440,6 @@
)
self.runtime.register_action_param("llm", self.llm)

self._configure_main_llm_streaming(
self.llm,
model_name=main_model.model,
provider_name=main_model.engine,
)
else:
log.warning("No main LLM specified in the config and no LLM provided via constructor.")

@@ -848,6 +826,7 @@ async def generate_async(

if streaming_handler:
streaming_handler_var.set(streaming_handler)
self._configure_main_llm_streaming(self.llm) # type: ignore

# Initialize the object with additional explanation information.
# We allow this to also be set externally. This is useful when multiple parallel
@@ -1189,7 +1168,7 @@ def _validate_streaming_with_output_rails(self) -> None:
if len(self.config.rails.output.flows) > 0 and (
not self.config.rails.output.streaming or not self.config.rails.output.streaming.enabled
):
raise InvalidRailsConfigurationError(
raise StreamingNotSupportedError(
"stream_async() cannot be used when output rails are configured but "
"rails.output.streaming.enabled is False. Either set "
"rails.output.streaming.enabled to True in your configuration, or use "
@@ -1246,6 +1225,8 @@ def stream_async(

streaming_handler = StreamingHandler(include_generation_metadata=include_generation_metadata)

self._configure_main_llm_streaming(self.llm) # type: ignore

# Create a properly managed task with exception handling
async def _generation_task():
try:
@@ -1357,6 +1338,9 @@ async def generate_events_async(
llm_stats = LLMStats()
llm_stats_var.set(llm_stats)

if streaming_handler_var.get():
self._configure_main_llm_streaming(self.llm) # type: ignore

# Compute the new events.
processing_log = []
new_events = await self.runtime.generate_events(events, processing_log=processing_log)
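Taken together, the llmrails.py changes switch the main LLM into streaming mode lazily, only when a streaming entry point is actually used. A sketch of the two entry points follows; the message content is illustrative, and iterating over a StreamingHandler to read chunks is an assumption based on how the handler is used elsewhere in the repository.

import asyncio

from nemoguardrails import LLMRails, RailsConfig
from nemoguardrails.streaming import StreamingHandler


async def main() -> None:
    app = LLMRails(RailsConfig.from_path("./config"))  # hypothetical config; no `streaming: True` needed
    messages = [{"role": "user", "content": "Hello!"}]

    # Path 1: stream_async() configures the main LLM for streaming internally.
    async for chunk in app.stream_async(messages=messages):
        print(chunk, end="", flush=True)

    # Path 2: passing an explicit StreamingHandler to generate_async() triggers
    # the same configuration via the new call added in generate_async().
    handler = StreamingHandler()
    generation = asyncio.create_task(
        app.generate_async(messages=messages, streaming_handler=handler)
    )
    async for chunk in handler:
        print(chunk, end="", flush=True)
    await generation


asyncio.run(main())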
18 changes: 5 additions & 13 deletions nemoguardrails/server/api.py
@@ -37,7 +37,6 @@
GenerationResponse,
)
from nemoguardrails.server.datastore.datastore import DataStore
from nemoguardrails.streaming import StreamingHandler

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)
@@ -426,18 +425,11 @@ async def chat_completion(body: RequestBody, request: Request):
# And prepend them.
messages = thread_messages + messages

if body.stream and llm_rails.config.streaming_supported and llm_rails.main_llm_supports_streaming:
# Create the streaming handler instance
streaming_handler = StreamingHandler()

# Start the generation
asyncio.create_task(
llm_rails.generate_async(
messages=messages,
streaming_handler=streaming_handler,
options=body.options,
state=body.state,
)
if body.stream:
streaming_handler = llm_rails.stream_async(
messages=messages,
options=body.options,
state=body.state,
)

# TODO: Add support for thread_ids in streaming mode
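On the client side the server change is transparent: a request with stream set to true now goes through stream_async() internally. A rough sketch of a streaming client follows; the endpoint path and the config_id field are assumptions, while the stream flag comes from RequestBody in this diff.

import requests

# Hypothetical streaming client for the guardrails server; adjust host, port, and config id.
response = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "config_id": "my_config",
        "messages": [{"role": "user", "content": "Hello!"}],
        "stream": True,
    },
    stream=True,
    timeout=60,
)
for chunk in response.iter_content(chunk_size=None, decode_unicode=True):
    print(chunk, end="", flush=True)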