
Commit cc28668

[None][feat] Refactor scaffolding streaming feature and fix openai wo… (#8622)
Signed-off-by: Fred Wei <[email protected]>
1 parent a4f7539 commit cc28668

File tree: 18 files changed (+163, -336 lines)


examples/scaffolding/contrib/AsyncGeneration/README.md

Lines changed: 0 additions & 10 deletions
This file was deleted.

examples/scaffolding/contrib/AsyncGeneration/stream_generation_run.py

Lines changed: 0 additions & 104 deletions
This file was deleted.

examples/scaffolding/contrib/Dynasor/scaffolding_dynasor_run.py

Lines changed: 7 additions & 8 deletions
@@ -65,15 +65,14 @@ def main():
     if args.streaming:

         async def task(prompt: str):
-            i = 0
+            step = 0
             async for result in llm.generate_async(prompt):
-                i += 1
-                print(">>>", i, result)
-                async for output in result.cur_output:
-                    print(">>>", i, len(output.outputs[0].token_ids), "\n",
-                          output.outputs[0].text)
-            print(f">>> final output {len(result.outputs[0].token_ids)}\n",
-                  result.outputs[0].text)
+                step += 1
+                tokens_num = len(
+                    result.outputs[0].token_ids
+                ) if result.outputs[0].token_ids is not None else 0
+                print(">>>", step, tokens_num, "\n", result.outputs[0].text)
+            print(f">>> final output {tokens_num}\n", result.outputs[0].text)

         # Need to provide LLM's event loop to get results in the middle of the whole process.
         asyncio.run_coroutine_threadsafe(task(prompts[0]), llm.loop).result()
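For context, a minimal self-contained sketch of the streaming consumption pattern after this change, assuming a `ScaffoldingLlm` instance built the same way as elsewhere in this example (illustrative only, not code from the diff):

```python
import asyncio


async def stream_once(llm, prompt: str):
    step = 0
    tokens_num = 0
    async for result in llm.generate_async(prompt):
        step += 1
        output = result.outputs[0]
        # token_ids can be None on intermediate results, hence the guard.
        tokens_num = len(output.token_ids) if output.token_ids is not None else 0
        print(">>>", step, tokens_num, "\n", output.text)
    print(f">>> final output {tokens_num}\n", result.outputs[0].text)


def run_streaming(llm, prompt: str):
    # ScaffoldingLlm runs its own event loop; schedule the coroutine onto
    # llm.loop from synchronous code and wait for it to finish.
    asyncio.run_coroutine_threadsafe(stream_once(llm, prompt), llm.loop).result()
```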

examples/scaffolding/contrib/mcp/mcptest.py

Lines changed: 5 additions & 5 deletions
@@ -4,8 +4,8 @@
 from openai import AsyncOpenAI

 from tensorrt_llm.scaffolding import OpenaiWorker, ScaffoldingLlm
-from tensorrt_llm.scaffolding.contrib import (ChatTask, MCPController,
-                                              MCPWorker, chat_handler)
+from tensorrt_llm.scaffolding.contrib.mcp import (ChatTask, MCPController,
+                                                   MCPWorker, chat_handler)


 def parse_arguments():
@@ -28,7 +28,7 @@ def parse_arguments():
 from openai import AsyncOpenAI

 from tensorrt_llm.scaffolding import OpenaiWorker, ScaffoldingLlm
-from tensorrt_llm.scaffolding.contrib import MCPController, MCPWorker
+from tensorrt_llm.scaffolding.contrib.mcp import MCPController, MCPWorker


 async def main():
@@ -41,7 +41,7 @@ async def main():
     ]
     API_KEY = args.API_KEY
     urls = [
-        "http://0.0.0.0:8080/sse", "http://0.0.0.0:8081/sse",
+        #"http://0.0.0.0:8080/sse", "http://0.0.0.0:8081/sse",
         "http://0.0.0.0:8082/sse"
     ]
     print(f"API_KEY {API_KEY}")
@@ -61,7 +61,7 @@ async def main():

     future = llm.generate_async(prompts[0])
     result = await future.aresult()
-    print(f"\nresult is {result.output.output_str}\n")
+    print(f"\nresult is {result.outputs[0].text}\n")

     print(f'main shutting down...')
     llm.shutdown()
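The MCP pieces now live in the `contrib.mcp` subpackage, and generated text is read from `result.outputs[0].text` rather than `result.output.output_str`. A short sketch of the updated usage, with worker and controller wiring elided (assumptions: only the import path and result access shown here come from the diff):

```python
from tensorrt_llm.scaffolding import OpenaiWorker, ScaffoldingLlm
# Updated import path after this commit: contrib.mcp instead of contrib.
from tensorrt_llm.scaffolding.contrib.mcp import (ChatTask, MCPController,
                                                  MCPWorker, chat_handler)


async def generate_and_print(llm: ScaffoldingLlm, prompt: str):
    # generate_async returns a future-like object; await its aresult().
    future = llm.generate_async(prompt)
    result = await future.aresult()
    # New result access: outputs[0].text (previously output.output_str).
    print(f"result is {result.outputs[0].text}")
```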

examples/scaffolding/run_basic_generation.py

Lines changed: 4 additions & 6 deletions
@@ -53,14 +53,12 @@ async def test_async_func(prompt, proposer_worker):
         prototype_controller,
         {NativeGenerationController.WorkerTag.GENERATION: proposer_worker},
     )
-    i = 0

+    step = 0
     async for result in llm.generate_async(prompt):
-        i += 1
-        print(">>>", i, result)
-        async for output in result.cur_output:
-            print(">>>", i, len(output.outputs[0].token_ids), "\n",
-                  output.outputs[0].text)
+        step += 1
+        print(">>>", step, len(result.outputs[0].token_ids), "\n",
+              result.outputs[0].text)
     print(f">>> final output {len(result.outputs[0].token_ids)}\n",
           result.outputs[0].text)

examples/scaffolding/token_budget_majority_vote.py

Lines changed: 1 addition & 1 deletion
@@ -104,7 +104,7 @@ def main():
     prompt = "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?\r\n\r\n"

     result = llm.generate(prompt)
-    extracted_answer = extract_answer_from_boxed(result.output.output_str)
+    extracted_answer = extract_answer_from_boxed(result.outputs[0].text)
     print(f'extracted_answer={extracted_answer}')

     llm.shutdown(shutdown_workers=True)
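The same result-access migration applies here; only the argument to `extract_answer_from_boxed` changes. A tiny sketch of the new access pattern, with a hypothetical regex stand-in for the example's real `extract_answer_from_boxed` helper:

```python
import re


def extract_answer_from_boxed(text: str):
    # Hypothetical stand-in: pull the contents of the last \boxed{...} span.
    matches = re.findall(r"\\boxed\{([^{}]*)\}", text)
    return matches[-1] if matches else None


def answer_from_result(result):
    # New access pattern: result.outputs[0].text (was result.output.output_str).
    return extract_answer_from_boxed(result.outputs[0].text)
```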

tensorrt_llm/scaffolding/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@
     "GenerationTask",
     "StreamGenerationTask",
     "RewardTask",
+    "StreamGenerationTask",
     "Worker",
     "OpenaiWorker",
     "TRTOpenaiWorker",

tensorrt_llm/scaffolding/contrib/AsyncGeneration/README.md

Lines changed: 0 additions & 46 deletions
This file was deleted.

tensorrt_llm/scaffolding/contrib/AsyncGeneration/__init__.py

Lines changed: 0 additions & 3 deletions
This file was deleted.

tensorrt_llm/scaffolding/contrib/Dynasor/dynasor_controller.py

Lines changed: 4 additions & 13 deletions
@@ -126,26 +126,17 @@ def process(self, tasks: List[GenerationTask], **kwargs):
                         probe_answers[-self.certainty_threshold:])
                     == self.certainty_threshold
                     and sum(probe_certain_count) == self.certainty_threshold):
-                tasks[0].result = probe_task.result
-                # If the current prompt indicates the chain-of-thought phase has ended, use one type of suffix.
-                if "</think>" in current_prompt:
-                    tasks[0].output_str = (current_prompt + self.answer_suffix +
-                                           probe_answers[-1] + "}\n\\]")
-                    return
-                else:
-                    # Otherwise, use the suffix with marker to transition clearly.
-                    tasks[0].output_str = (current_prompt +
-                                           self.answer_suffix_with_marker +
-                                           probe_answers[-1] + "}\n\\]")
-                    return
+                suffix = self.answer_suffix if "</think>" in current_prompt else self.answer_suffix_with_marker
+                suffix += probe_answers[-1] + "}\n\\]"
+                current_prompt += suffix
+                break

             # If not confident, do another round of generation
             # Append the newly generated text from the proposer to the current prompt for the next iteration.
             current_prompt += proposer_task.output_str

         # If the maximum token limit is reached without satisfying the certainty condition,
         # output the accumulated prompt as the final output.
-        tasks[0].result = proposer_task.result
         tasks[0].output_str = current_prompt
         return

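The refactor in `DynasorGenerationController.process` collapses the duplicated suffix branches into a single selection and replaces the early returns with a `break`, so both the confident path and the token-budget path fall through to the one `tasks[0].output_str = current_prompt` assignment. An illustrative skeleton of the new control flow (method and helper names other than the suffix fields are hypothetical):

```python
def process_skeleton(self, tasks, initial_prompt):
    current_prompt = initial_prompt
    for _ in range(self.max_rounds):  # hypothetical budget loop
        # Hypothetical helper: run one probe/proposer round on the current prompt.
        probe_answers, probe_certain_count, proposer_task = self._run_round(current_prompt)
        if self._is_confident(probe_answers, probe_certain_count):  # hypothetical check
            # One suffix selection replaces the old duplicated if/else branches.
            suffix = (self.answer_suffix if "</think>" in current_prompt
                      else self.answer_suffix_with_marker)
            suffix += probe_answers[-1] + "}\n\\]"
            current_prompt += suffix
            break
        # Not confident yet: append the proposer's new text and run another round.
        current_prompt += proposer_task.output_str
    # Single exit point: whatever has accumulated becomes the final output.
    tasks[0].output_str = current_prompt
    return
```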