sgl-project
diff --git a/‎_sources/advanced_features/lora.ipynb‎
Lines changed: 161 additions & 156 deletions b/‎_sources/advanced_features/lora.ipynb‎
Lines changed: 161 additions & 156 deletions
diff --git a/‎_sources/advanced_features/separate_reasoning.ipynb‎
Lines changed: 122 additions & 96 deletions b/‎_sources/advanced_features/separate_reasoning.ipynb‎
Lines changed: 122 additions & 96 deletions
diff --git a/‎_sources/advanced_features/speculative_decoding.ipynb‎
Lines changed: 401 additions & 363 deletions b/‎_sources/advanced_features/speculative_decoding.ipynb‎
Lines changed: 401 additions & 363 deletions
diff --git a/‎_sources/advanced_features/structured_outputs.ipynb‎
Lines changed: 132 additions & 139 deletions b/‎_sources/advanced_features/structured_outputs.ipynb‎
Lines changed: 132 additions & 139 deletions
diff --git a/‎_sources/advanced_features/structured_outputs_for_reasoning_models.ipynb‎
Lines changed: 171 additions & 191 deletions b/‎_sources/advanced_features/structured_outputs_for_reasoning_models.ipynb‎
Lines changed: 171 additions & 191 deletions
diff --git a/‎_sources/advanced_features/tool_parser.ipynb‎
Lines changed: 159 additions & 160 deletions b/‎_sources/advanced_features/tool_parser.ipynb‎
Lines changed: 159 additions & 160 deletions
diff --git a/‎_sources/advanced_features/vlm_query.ipynb‎
Lines changed: 230 additions & 237 deletions b/‎_sources/advanced_features/vlm_query.ipynb‎
Lines changed: 230 additions & 237 deletions
diff --git a/‎_sources/basic_usage/native_api.ipynb‎
Lines changed: 154 additions & 160 deletions b/‎_sources/basic_usage/native_api.ipynb‎
Lines changed: 154 additions & 160 deletions
diff --git a/‎_sources/basic_usage/offline_engine_api.ipynb‎
Lines changed: 468 additions & 508 deletions b/‎_sources/basic_usage/offline_engine_api.ipynb‎
Lines changed: 468 additions & 508 deletions
diff --git a/‎_sources/basic_usage/openai_api_completions.ipynb‎
Lines changed: 64 additions & 46 deletions b/‎_sources/basic_usage/openai_api_completions.ipynb‎
Lines changed: 64 additions & 46 deletions
@@ -31,10 +31,10 @@
    "execution_count": 1,
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2025-10-06T02:09:58.606964Z",
-     "iopub.status.busy": "2025-10-06T02:09:58.606839Z",
-     "iopub.status.idle": "2025-10-06T02:10:37.396079Z",
-     "shell.execute_reply": "2025-10-06T02:10:37.395176Z"
+     "iopub.execute_input": "2025-10-06T03:00:32.007007Z",
+     "iopub.status.busy": "2025-10-06T03:00:32.006876Z",
+     "iopub.status.idle": "2025-10-06T03:01:13.019849Z",
+     "shell.execute_reply": "2025-10-06T03:01:13.019316Z"
     }
    },
    "outputs": [
@@ -76,7 +76,13 @@
      "output_type": "stream",
      "text": [
       "/usr/local/lib/python3.10/dist-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.\n",
-      "  import pynvml  # type: ignore[import]\n",
+      "  import pynvml  # type: ignore[import]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
       "/usr/local/lib/python3.10/dist-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.\n",
       "  import pynvml  # type: ignore[import]\n"
      ]
@@ -86,7 +92,7 @@
      "output_type": "stream",
      "text": [
       "`torch_dtype` is deprecated! Use `dtype` instead!\n",
-      "[2025-10-06 02:10:25] `torch_dtype` is deprecated! Use `dtype` instead!\n"
+      "[2025-10-06 03:01:01] `torch_dtype` is deprecated! Use `dtype` instead!\n"
      ]
     },
     {
@@ -96,20 +102,32 @@
       "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
       "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
       "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
-      "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
-      "[2025-10-06 02:10:27] MOE_RUNNER_BACKEND is not initialized, using triton backend\n"
+      "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2025-10-06 03:01:02] MOE_RUNNER_BACKEND is not initialized, using triton backend\n"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "\r",
-      "Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]\n",
+      "Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
       "\r",
-      "Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  5.47it/s]\n",
+      "Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  4.20it/s]\n",
       "\r",
-      "Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  5.46it/s]\n",
+      "Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  4.20it/s]\n",
       "\n"
      ]
     },
@@ -119,18 +137,18 @@
      "text": [
       "\r",
       "  0%|          | 0/3 [00:00<?, ?it/s]\r",
-      "Capturing batches (bs=4 avail_mem=67.93 GB):   0%|          | 0/3 [00:00<?, ?it/s]"
+      "Capturing batches (bs=4 avail_mem=77.03 GB):   0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "\r",
-      "Capturing batches (bs=4 avail_mem=67.93 GB):  33%|███▎      | 1/3 [00:00<00:00,  3.74it/s]\r",
-      "Capturing batches (bs=2 avail_mem=67.87 GB):  33%|███▎      | 1/3 [00:00<00:00,  3.74it/s]\r",
-      "Capturing batches (bs=1 avail_mem=67.86 GB):  33%|███▎      | 1/3 [00:00<00:00,  3.74it/s]\r",
-      "Capturing batches (bs=1 avail_mem=67.86 GB): 100%|██████████| 3/3 [00:00<00:00,  8.74it/s]\n"
+      "Capturing batches (bs=4 avail_mem=77.03 GB):  33%|███▎      | 1/3 [00:00<00:01,  1.53it/s]\r",
+      "Capturing batches (bs=2 avail_mem=76.97 GB):  33%|███▎      | 1/3 [00:00<00:01,  1.53it/s]\r",
+      "Capturing batches (bs=1 avail_mem=76.96 GB):  33%|███▎      | 1/3 [00:00<00:01,  1.53it/s]\r",
+      "Capturing batches (bs=1 avail_mem=76.96 GB): 100%|██████████| 3/3 [00:00<00:00,  4.17it/s]\n"
      ]
     },
     {
@@ -149,7 +167,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Server started on http://localhost:39399\n"
+      "Server started on http://localhost:31747\n"
      ]
     }
    ],
@@ -183,17 +201,17 @@
    "execution_count": 2,
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2025-10-06T02:10:37.398656Z",
-     "iopub.status.busy": "2025-10-06T02:10:37.398258Z",
-     "iopub.status.idle": "2025-10-06T02:10:37.661630Z",
-     "shell.execute_reply": "2025-10-06T02:10:37.661051Z"
+     "iopub.execute_input": "2025-10-06T03:01:13.024173Z",
+     "iopub.status.busy": "2025-10-06T03:01:13.023033Z",
+     "iopub.status.idle": "2025-10-06T03:01:13.307561Z",
+     "shell.execute_reply": "2025-10-06T03:01:13.307079Z"
     }
    },
    "outputs": [
     {
      "data": {
       "text/html": [
-       "<strong style='color: #00008B;'>Response: ChatCompletion(id='375f25746b1e45be92cdf750444e14b7', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Sure, here are three countries and their respective capitals:\\n\\n1. **United States** - Washington, D.C.\\n2. **Canada** - Ottawa\\n3. **Australia** - Canberra', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None, reasoning_content=None), matched_stop=151645)], created=1759716637, model='qwen/qwen2.5-0.5b-instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=39, prompt_tokens=37, total_tokens=76, completion_tokens_details=None, prompt_tokens_details=None, reasoning_tokens=0), metadata={'weight_version': 'default'})</strong>"
+       "<strong style='color: #00008B;'>Response: ChatCompletion(id='22b3d25954cb404ca70949629f098753', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Sure, here are three countries and their respective capitals:\\n\\n1. **United States** - Washington, D.C.\\n2. **Canada** - Ottawa\\n3. **Australia** - Canberra', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None, reasoning_content=None), matched_stop=151645)], created=1759719673, model='qwen/qwen2.5-0.5b-instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=39, prompt_tokens=37, total_tokens=76, completion_tokens_details=None, prompt_tokens_details=None, reasoning_tokens=0), metadata={'weight_version': 'default'})</strong>"
       ],
       "text/plain": [
        "<IPython.core.display.HTML object>"
@@ -383,17 +401,17 @@
    "execution_count": 3,
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2025-10-06T02:10:37.663358Z",
-     "iopub.status.busy": "2025-10-06T02:10:37.663203Z",
-     "iopub.status.idle": "2025-10-06T02:10:37.885552Z",
-     "shell.execute_reply": "2025-10-06T02:10:37.884986Z"
+     "iopub.execute_input": "2025-10-06T03:01:13.309570Z",
+     "iopub.status.busy": "2025-10-06T03:01:13.309423Z",
+     "iopub.status.idle": "2025-10-06T03:01:13.543919Z",
+     "shell.execute_reply": "2025-10-06T03:01:13.543474Z"
     }
    },
    "outputs": [
     {
      "data": {
       "text/html": [
-       "<strong style='color: #00008B;'>The major achievements of ancient Rome include the construction of the Colosseum, the Pantheon, and the aqueducts that brought water to the city. They also developed a complex system of governance, including the Roman Republic and the Roman Empire. The Romans were known for their military prowess, particularly in their campaigns against the Carthaginians and the Gauls. They are also famous for their art, architecture, and philosophy, which influenced later cultures.</strong>"
+       "<strong style='color: #00008B;'>The ancient Romans made significant contributions to various fields, including law, philosophy, science, and literature. They were known for their engineering skills, particularly in the construction of monumental architecture like the Colosseum and the Pantheon. The Roman Empire was one of the largest and most powerful empires in history, spanning across Europe, Asia Minor, and North Africa. Their cultural achievements included the development of Roman law and the spread of Christianity. Additionally, they were skilled in agriculture and had a sophisticated system of governance.</strong>"
       ],
       "text/plain": [
        "<IPython.core.display.HTML object>"
@@ -442,18 +460,18 @@
    "execution_count": 4,
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2025-10-06T02:10:37.886984Z",
-     "iopub.status.busy": "2025-10-06T02:10:37.886831Z",
-     "iopub.status.idle": "2025-10-06T02:10:37.997297Z",
-     "shell.execute_reply": "2025-10-06T02:10:37.996755Z"
+     "iopub.execute_input": "2025-10-06T03:01:13.545307Z",
+     "iopub.status.busy": "2025-10-06T03:01:13.545169Z",
+     "iopub.status.idle": "2025-10-06T03:01:13.655547Z",
+     "shell.execute_reply": "2025-10-06T03:01:13.654789Z"
     }
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "I apologize, but I need more context to understand what you mean by \"a test.\" Could you please provide more details or clarify your question? Whether it's a specific technical question, a question about a specific language, or something else, I could help you with the information you need."
+      "Yes, \"test\" is a code word for a survey or a sample in various contexts. It's common to use \"test\" to refer to a small group of people or a particular population whose opinions or behaviors will be surveyed in a testing or evaluation activity."
      ]
     }
    ],
@@ -483,17 +501,17 @@
    "execution_count": 5,
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2025-10-06T02:10:37.998642Z",
-     "iopub.status.busy": "2025-10-06T02:10:37.998497Z",
-     "iopub.status.idle": "2025-10-06T02:10:38.148945Z",
-     "shell.execute_reply": "2025-10-06T02:10:38.148459Z"
+     "iopub.execute_input": "2025-10-06T03:01:13.657280Z",
+     "iopub.status.busy": "2025-10-06T03:01:13.656894Z",
+     "iopub.status.idle": "2025-10-06T03:01:13.822803Z",
+     "shell.execute_reply": "2025-10-06T03:01:13.822237Z"
     }
    },
    "outputs": [
     {
      "data": {
       "text/html": [
-       "<strong style='color: #00008B;'>Response: Completion(id='b15c911e50ec48bb915fc8a641dd052c', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. United States - Washington D.C.\\n2. Canada - Ottawa\\n3. France - Paris\\n4. Germany - Berlin\\n5. Japan - Tokyo\\n6. Italy - Rome\\n7. Spain - Madrid\\n8. United Kingdom - London\\n9. Australia - Canberra\\n10. New Zealand', matched_stop=None)], created=1759716638, model='qwen/qwen2.5-0.5b-instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=8, total_tokens=72, completion_tokens_details=None, prompt_tokens_details=None, reasoning_tokens=0), metadata={'weight_version': 'default'})</strong>"
+       "<strong style='color: #00008B;'>Response: Completion(id='0a9bef63fc134f77b60a8501ba5d912d', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. United States - Washington D.C.\\n2. Canada - Ottawa\\n3. France - Paris\\n4. Germany - Berlin\\n5. Japan - Tokyo\\n6. Italy - Rome\\n7. Spain - Madrid\\n8. United Kingdom - London\\n9. Australia - Canberra\\n10. New Zealand', matched_stop=None)], created=1759719673, model='qwen/qwen2.5-0.5b-instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=8, total_tokens=72, completion_tokens_details=None, prompt_tokens_details=None, reasoning_tokens=0), metadata={'weight_version': 'default'})</strong>"
       ],
       "text/plain": [
        "<IPython.core.display.HTML object>"
@@ -532,17 +550,17 @@
    "execution_count": 6,
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2025-10-06T02:10:38.150455Z",
-     "iopub.status.busy": "2025-10-06T02:10:38.150314Z",
-     "iopub.status.idle": "2025-10-06T02:10:38.246197Z",
-     "shell.execute_reply": "2025-10-06T02:10:38.245724Z"
+     "iopub.execute_input": "2025-10-06T03:01:13.824422Z",
+     "iopub.status.busy": "2025-10-06T03:01:13.824267Z",
+     "iopub.status.idle": "2025-10-06T03:01:13.902031Z",
+     "shell.execute_reply": "2025-10-06T03:01:13.901479Z"
     }
    },
    "outputs": [
     {
      "data": {
       "text/html": [
-       "<strong style='color: #00008B;'>Response: Completion(id='addc2fde8786458da18724887320bc9c', choices=[CompletionChoice(finish_reason='stop', index=0, logprobs=None, text=' Once upon a time, there was a space explorer named Captain Alex. He had always dreamed of exploring the stars and uncovering the mysteries of the universe. One day, he received an invitation to join a team of scientists on a mission to explore a new planet.', matched_stop='\\n\\n')], created=1759716638, model='qwen/qwen2.5-0.5b-instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=53, prompt_tokens=9, total_tokens=62, completion_tokens_details=None, prompt_tokens_details=None, reasoning_tokens=0), metadata={'weight_version': 'default'})</strong>"
+       "<strong style='color: #00008B;'>Response: Completion(id='b96d15fb328c43c8afb3db6bf8ab7023', choices=[CompletionChoice(finish_reason='stop', index=0, logprobs=None, text=' As an AI language model, I do not have personal experiences or emotions, but I can generate a fictional story based on common themes and events that people might encounter in space travel.', matched_stop='\\n\\n')], created=1759719673, model='qwen/qwen2.5-0.5b-instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=36, prompt_tokens=9, total_tokens=45, completion_tokens_details=None, prompt_tokens_details=None, reasoning_tokens=0), metadata={'weight_version': 'default'})</strong>"
       ],
       "text/plain": [
        "<IPython.core.display.HTML object>"
@@ -583,10 +601,10 @@
    "execution_count": 7,
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2025-10-06T02:10:38.247645Z",
-     "iopub.status.busy": "2025-10-06T02:10:38.247506Z",
-     "iopub.status.idle": "2025-10-06T02:10:38.296314Z",
-     "shell.execute_reply": "2025-10-06T02:10:38.294678Z"
+     "iopub.execute_input": "2025-10-06T03:01:13.903616Z",
+     "iopub.status.busy": "2025-10-06T03:01:13.903457Z",
+     "iopub.status.idle": "2025-10-06T03:01:13.943927Z",
+     "shell.execute_reply": "2025-10-06T03:01:13.943243Z"
     }
    },
    "outputs": [],