
Commit cc28668

[None][feat] Refactor scaffolding streaming feature and fix openai wo… (#8622)
Signed-off-by: Fred Wei <[email protected]>
1 parent a4f7539 commit cc28668

File tree: 18 files changed (+163, -336 lines)


examples/scaffolding/contrib/AsyncGeneration/README.md

Lines changed: 0 additions & 10 deletions
This file was deleted.

examples/scaffolding/contrib/AsyncGeneration/stream_generation_run.py

Lines changed: 0 additions & 104 deletions
This file was deleted.

examples/scaffolding/contrib/Dynasor/scaffolding_dynasor_run.py

Lines changed: 7 additions & 8 deletions
@@ -65,15 +65,14 @@ def main():
     if args.streaming:

         async def task(prompt: str):
-            i = 0
+            step = 0
             async for result in llm.generate_async(prompt):
-                i += 1
-                print(">>>", i, result)
-                async for output in result.cur_output:
-                    print(">>>", i, len(output.outputs[0].token_ids), "\n",
-                          output.outputs[0].text)
-            print(f">>> final output {len(result.outputs[0].token_ids)}\n",
-                  result.outputs[0].text)
+                step += 1
+                tokens_num = len(
+                    result.outputs[0].token_ids
+                ) if result.outputs[0].token_ids is not None else 0
+                print(">>>", step, tokens_num, "\n", result.outputs[0].text)
+            print(f">>> final output {tokens_num}\n", result.outputs[0].text)

         # Need to provide LLM's event loop to get results in the middle of the whole process.
         asyncio.run_coroutine_threadsafe(task(prompts[0]), llm.loop).result()
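For context, a minimal self-contained sketch of the streaming consumption pattern after this change, assuming a `ScaffoldingLlm` instance built the same way as elsewhere in this example (illustrative only, not code from the diff):

```python
import asyncio


async def stream_once(llm, prompt: str):
    step = 0
    tokens_num = 0
    async for result in llm.generate_async(prompt):
        step += 1
        output = result.outputs[0]
        # token_ids can be None on intermediate results, hence the guard.
        tokens_num = len(output.token_ids) if output.token_ids is not None else 0
        print(">>>", step, tokens_num, "\n", output.text)
    print(f">>> final output {tokens_num}\n", result.outputs[0].text)


def run_streaming(llm, prompt: str):
    # ScaffoldingLlm runs its own event loop; schedule the coroutine onto
    # llm.loop from synchronous code and wait for it to finish.
    asyncio.run_coroutine_threadsafe(stream_once(llm, prompt), llm.loop).result()
```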

examples/scaffolding/contrib/mcp/mcptest.py

Lines changed: 5 additions & 5 deletions
@@ -4,8 +4,8 @@
 from openai import AsyncOpenAI

 from tensorrt_llm.scaffolding import OpenaiWorker, ScaffoldingLlm
-from tensorrt_llm.scaffolding.contrib import (ChatTask, MCPController,
-                                              MCPWorker, chat_handler)
+from tensorrt_llm.scaffolding.contrib.mcp import (ChatTask, MCPController,
+                                                   MCPWorker, chat_handler)


 def parse_arguments():
@@ -28,7 +28,7 @@ def parse_arguments():
 from openai import AsyncOpenAI

 from tensorrt_llm.scaffolding import OpenaiWorker, ScaffoldingLlm
-from tensorrt_llm.scaffolding.contrib import MCPController, MCPWorker
+from tensorrt_llm.scaffolding.contrib.mcp import MCPController, MCPWorker


 async def main():
@@ -41,7 +41,7 @@ async def main():
     ]
     API_KEY = args.API_KEY
     urls = [
-        "http://0.0.0.0:8080/sse", "http://0.0.0.0:8081/sse",
+        #"http://0.0.0.0:8080/sse", "http://0.0.0.0:8081/sse",
         "http://0.0.0.0:8082/sse"
     ]
     print(f"API_KEY {API_KEY}")
@@ -61,7 +61,7 @@ async def main():

     future = llm.generate_async(prompts[0])
     result = await future.aresult()
-    print(f"\nresult is {result.output.output_str}\n")
+    print(f"\nresult is {result.outputs[0].text}\n")

     print(f'main shutting down...')
     llm.shutdown()
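The MCP pieces now live in the `contrib.mcp` subpackage, and generated text is read from `result.outputs[0].text` rather than `result.output.output_str`. A short sketch of the updated usage, with worker and controller wiring elided (assumptions: only the import path and result access shown here come from the diff):

```python
from tensorrt_llm.scaffolding import OpenaiWorker, ScaffoldingLlm
# Updated import path after this commit: contrib.mcp instead of contrib.
from tensorrt_llm.scaffolding.contrib.mcp import (ChatTask, MCPController,
                                                  MCPWorker, chat_handler)


async def generate_and_print(llm: ScaffoldingLlm, prompt: str):
    # generate_async returns a future-like object; await its aresult().
    future = llm.generate_async(prompt)
    result = await future.aresult()
    # New result access: outputs[0].text (previously output.output_str).
    print(f"result is {result.outputs[0].text}")
```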

examples/scaffolding/run_basic_generation.py

Lines changed: 4 additions & 6 deletions
@@ -53,14 +53,12 @@ async def test_async_func(prompt, proposer_worker):
         prototype_controller,
         {NativeGenerationController.WorkerTag.GENERATION: proposer_worker},
     )
-    i = 0

+    step = 0
     async for result in llm.generate_async(prompt):
-        i += 1
-        print(">>>", i, result)
-        async for output in result.cur_output:
-            print(">>>", i, len(output.outputs[0].token_ids), "\n",
-                  output.outputs[0].text)
+        step += 1
+        print(">>>", step, len(result.outputs[0].token_ids), "\n",
+              result.outputs[0].text)
     print(f">>> final output {len(result.outputs[0].token_ids)}\n",
           result.outputs[0].text)

examples/scaffolding/token_budget_majority_vote.py

Lines changed: 1 addition & 1 deletion
@@ -104,7 +104,7 @@ def main():
     prompt = "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?\r\n\r\n"

     result = llm.generate(prompt)
-    extracted_answer = extract_answer_from_boxed(result.output.output_str)
+    extracted_answer = extract_answer_from_boxed(result.outputs[0].text)
     print(f'extracted_answer={extracted_answer}')

     llm.shutdown(shutdown_workers=True)
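The same result-access migration applies here; only the argument to `extract_answer_from_boxed` changes. A tiny sketch of the new access pattern, with a hypothetical regex stand-in for the example's real `extract_answer_from_boxed` helper:

```python
import re


def extract_answer_from_boxed(text: str):
    # Hypothetical stand-in: pull the contents of the last \boxed{...} span.
    matches = re.findall(r"\\boxed\{([^{}]*)\}", text)
    return matches[-1] if matches else None


def answer_from_result(result):
    # New access pattern: result.outputs[0].text (was result.output.output_str).
    return extract_answer_from_boxed(result.outputs[0].text)
```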

tensorrt_llm/scaffolding/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@
     "GenerationTask",
     "StreamGenerationTask",
     "RewardTask",
+    "StreamGenerationTask",
     "Worker",
     "OpenaiWorker",
     "TRTOpenaiWorker",

tensorrt_llm/scaffolding/contrib/AsyncGeneration/README.md

Lines changed: 0 additions & 46 deletions
This file was deleted.

tensorrt_llm/scaffolding/contrib/AsyncGeneration/__init__.py

Lines changed: 0 additions & 3 deletions
This file was deleted.

tensorrt_llm/scaffolding/contrib/Dynasor/dynasor_controller.py

Lines changed: 4 additions & 13 deletions
@@ -126,26 +126,17 @@ def process(self, tasks: List[GenerationTask], **kwargs):
                         probe_answers[-self.certainty_threshold:])
                     == self.certainty_threshold
                     and sum(probe_certain_count) == self.certainty_threshold):
-                tasks[0].result = probe_task.result
-                # If the current prompt indicates the chain-of-thought phase has ended, use one type of suffix.
-                if "</think>" in current_prompt:
-                    tasks[0].output_str = (current_prompt + self.answer_suffix +
-                                           probe_answers[-1] + "}\n\\]")
-                    return
-                else:
-                    # Otherwise, use the suffix with marker to transition clearly.
-                    tasks[0].output_str = (current_prompt +
-                                           self.answer_suffix_with_marker +
-                                           probe_answers[-1] + "}\n\\]")
-                    return
+                suffix = self.answer_suffix if "</think>" in current_prompt else self.answer_suffix_with_marker
+                suffix += probe_answers[-1] + "}\n\\]"
+                current_prompt += suffix
+                break

             # If not confident, do another round of generation
             # Append the newly generated text from the proposer to the current prompt for the next iteration.
             current_prompt += proposer_task.output_str

         # If the maximum token limit is reached without satisfying the certainty condition,
         # output the accumulated prompt as the final output.
-        tasks[0].result = proposer_task.result
         tasks[0].output_str = current_prompt
         return

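The refactor in `DynasorGenerationController.process` collapses the duplicated suffix branches into a single selection and replaces the early returns with a `break`, so both the confident path and the token-budget path fall through to the one `tasks[0].output_str = current_prompt` assignment. An illustrative skeleton of the new control flow (method and helper names other than the suffix fields are hypothetical):

```python
def process_skeleton(self, tasks, initial_prompt):
    current_prompt = initial_prompt
    for _ in range(self.max_rounds):  # hypothetical budget loop
        # Hypothetical helper: run one probe/proposer round on the current prompt.
        probe_answers, probe_certain_count, proposer_task = self._run_round(current_prompt)
        if self._is_confident(probe_answers, probe_certain_count):  # hypothetical check
            # One suffix selection replaces the old duplicated if/else branches.
            suffix = (self.answer_suffix if "</think>" in current_prompt
                      else self.answer_suffix_with_marker)
            suffix += probe_answers[-1] + "}\n\\]"
            current_prompt += suffix
            break
        # Not confident yet: append the proposer's new text and run another round.
        current_prompt += proposer_task.output_str
    # Single exit point: whatever has accumulated becomes the final output.
    tasks[0].output_str = current_prompt
    return
```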