From 50f5adcc8abc0002af39228fedfc35ab68be1d82 Mon Sep 17 00:00:00 2001 From: Jhin <47354855+jhinpan@users.noreply.github.com> Date: Sun, 2 Feb 2025 13:31:10 -0600 Subject: [PATCH 1/4] Feature: Add SGLang Support to backend --- Makefile | 1 + README.md | 63 ++++++++++++++++++++++----------- setup.py | 1 + slurm/evaluate.slurm | 9 ++--- src/open_r1/generate.py | 50 ++++++++++++++++++++++++-- src/open_r1/utils/evaluation.py | 18 ++++++++-- 6 files changed, 113 insertions(+), 29 deletions(-) diff --git a/Makefile b/Makefile index 1140e59e..5be6765f 100644 --- a/Makefile +++ b/Makefile @@ -18,6 +18,7 @@ quality: # Evaluation evaluate: + $(eval BACKEND := $(if $(BACKEND),$(BACKEND),vllm)) $(eval PARALLEL_ARGS := $(if $(PARALLEL),$(shell \ if [ "$(PARALLEL)" = "data" ]; then \ echo "data_parallel_size=$(NUM_GPUS)"; \ diff --git a/README.md b/README.md index ebb725e2..d71a6cea 100644 --- a/README.md +++ b/README.md @@ -2,18 +2,18 @@ *A fully open reproduction of DeepSeek-R1. This repo is a work in progress, let's build it together!* -**Table of Contents** -1. [Overview](#overview) -2. [Plan of attack](#plan-of-attack) -3. [Installation](#installation) -4. [Training models](#training-models) - - [SFT](#sft) - - [GRPO](#grpo) -5. [Evaluating models](#evaluating-models) -6. [Reproducing Deepseek's evaluation results on MATH-500](#reproducing-deepseeks-evaluation-results-on-math-500) -7. [Data generation](#data-generation) - - [Generate data from a smol distilled R1 model](#generate-data-from-a-smol-distilled-r1-model) - - [Generate data from DeepSeek-R1](#generate-data-from-deepseek-r1) +**Table of Contents** +1. [Overview](#overview) +2. [Plan of attack](#plan-of-attack) +3. [Installation](#installation) +4. [Training models](#training-models) + - [SFT](#sft) + - [GRPO](#grpo) +5. [Evaluating models](#evaluating-models) +6. [Reproducing Deepseek's evaluation results on MATH-500](#reproducing-deepseeks-evaluation-results-on-math-500) +7. [Data generation](#data-generation) + - [Generate data from a smol distilled R1 model](#generate-data-from-a-smol-distilled-r1-model) + - [Generate data from DeepSeek-R1](#generate-data-from-deepseek-r1) 8. [Contributing](#contributing) ## Overview @@ -45,6 +45,7 @@ We will use the DeepSeek-R1 [tech report](https://github.com/deepseek-ai/DeepSee **Note: Libraries rely on CUDA 12.1. Double check your system if you get segmentation faults.** + To run the code in this project, first, create a Python virtual environment using e.g. `uv`. To install `uv`, follow the [UV Installation Guide](https://docs.astral.sh/uv/getting-started/installation/). @@ -63,7 +64,19 @@ pip install vllm>=0.7.0 --extra-index-url https://download.pytorch.org/whl/cu121 export LD_LIBRARY_PATH=$(python -c "import site; print(site.getsitepackages()[0] + '/nvidia/nvjitlink/lib')"):$LD_LIBRARY_PATH ``` -This will also install PyTorch `v2.5.1` and it is **very important** to use this version since the vLLM binaries are compiled for it. You can then install the remaining dependencies for your specific use case via `pip install -e .[LIST OF MODES]`. For most contributors, we recommend: +This will also install PyTorch `v2.5.1` and it is **very important** to use this version since the vLLM binaries are compiled for it. 
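(Editorial aside, not part of the patch: since the paragraph above stresses that PyTorch must stay at the version the vLLM binaries were compiled for, a quick sanity check after installation is shown below; the `2.5.1` value is simply the version quoted in the sentence above.)

```shell
python -c "import torch; print(torch.__version__)"  # expect 2.5.1 when using the vLLM install path
```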
+ +Alternatively, you can use *SGLang* as a replacement for *vLLM*: + +```shell +pip install --upgrade pip +pip install sgl-kernel --force-reinstall --no-deps +pip install "sglang[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/ +``` + +**Note:** When using SGLang, make sure to check the [FlashInfer installation doc](https://github.com/flashinfer-ai/flashinfer) to install the proper version according to your PyTorch and CUDA versions. + +You can then install the remaining dependencies for your specific use case via `pip install -e .[LIST OF MODES]`. For most contributors, we recommend: ```shell pip install -e ".[dev]" @@ -109,7 +122,7 @@ To launch a Slurm job, run: sbatch --output=/path/to/logs/%x-%j.out --err=/path/to/logs/%x-%j.err slurm/sft.slurm {model} {dataset} {accelerator} ``` -Here `{model}` and `{dataset}` refer to the model and dataset IDs on the Hugging Face Hub, while `{accelerator}` refers to the choice of an 🤗 Accelerate config file in configs. +Here `{model}` and `{dataset}` refer to the model and dataset IDs on the Hugging Face Hub, while `{accelerator}` refers to the choice of an 🤗 Accelerate config file in configs. ### GRPO @@ -141,7 +154,7 @@ lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \ --custom-tasks src/open_r1/evaluate.py \ --use-chat-template \ --system-prompt="Please reason step by step, and put your final answer within \boxed{}." \ - --output-dir $OUTPUT_DIR + --output-dir $OUTPUT_DIR ``` To increase throughput across multiple GPUs, use _data parallel_ as follows: @@ -157,7 +170,7 @@ lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \ --custom-tasks src/open_r1/evaluate.py \ --use-chat-template \ --system-prompt="Please reason step by step, and put your final answer within \boxed{}." \ - --output-dir $OUTPUT_DIR + --output-dir $OUTPUT_DIR ``` For large models which require sharding across GPUs, use _tensor parallel_ and run: @@ -174,7 +187,7 @@ lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \ --custom-tasks src/open_r1/evaluate.py \ --use-chat-template \ --system-prompt="Please reason step by step, and put your final answer within \boxed{}." \ - --output-dir $OUTPUT_DIR + --output-dir $OUTPUT_DIR ``` You can also launch an evaluation with `make evaluate`, specifying the model, task, and optionally the parallelism technique and number of GPUs. @@ -193,6 +206,16 @@ To use Tensor Parallelism: ```shell make evaluate MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B TASK=aime24 PARALLEL=tensor NUM_GPUS=8 ``` + +To use SGLang instead of vLLM, you would then run the commands with the --backend sglang flag or BACKEND=sglang for make commands. For example: + +To evaluate on a single GPU: +```shell +make evaluate MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B TASK=aime24 BackEND=sglang +``` + + + ## Reproducing Deepseek's evaluation results on MATH-500 We are able to reproduce Deepseek's reported results on the MATH-500 Benchmark: | Model | MATH-500 (HF lighteval) | MATH-500 (DeepSeek Reported) | @@ -222,7 +245,7 @@ sbatch slurm/evaluate.slurm deepseek-ai/DeepSeek-R1-Distill-Llama-70B math_500 t ### Generate data from a smol distilled R1 model -The following example can be run in 1xH100. +The following example can be run in 1xH100. 
First install the following dependencies: ```shell @@ -265,7 +288,7 @@ with Pipeline( ) prompt_column = "problem" text_generation = TextGeneration( - llm=llm, + llm=llm, template=prompt_template, num_generations=4, input_mappings={"instruction": prompt_column} if prompt_column is not None else {} @@ -302,7 +325,7 @@ sbatch slurm/generate.slurm \ --hf-output-dataset username/r1-dataset ``` -> [!NOTE] +> [!NOTE] > While the job is running, you can setup an SSH tunnel through the cluster login node to access the Ray dashboard from your computer running `ssh -L 8265:ray_ip_head_node:8265 `, then browsing `http://localhost:8265` ## Contributing diff --git a/setup.py b/setup.py index b3b10694..662fea28 100644 --- a/setup.py +++ b/setup.py @@ -64,6 +64,7 @@ "transformers @ git+https://github.com/huggingface/transformers.git@main", "trl @ git+https://github.com/huggingface/trl.git@main", "vllm>=0.7.0", + "sglang>=0.4.0", "wandb>=0.19.1", ] diff --git a/slurm/evaluate.slurm b/slurm/evaluate.slurm index 5fe7f8e3..2da6a8a5 100644 --- a/slurm/evaluate.slurm +++ b/slurm/evaluate.slurm @@ -4,7 +4,7 @@ #SBATCH --ntasks-per-node=1 #SBATCH --exclusive #SBATCH --gres=gpu:8 -#SBATCH --partition=hopper-prod +#SBATCH --partition=hopper-prod #SBATCH --time=01:59:00 #SBATCH --output=./logs/evaluate/%x-%j.out #SBATCH --err=./logs/evaluate/%x-%j.err @@ -44,12 +44,13 @@ export NCCL_ASYNC_ERROR_HANDLING=1 # Be ye warned this may not work on other clusters! module load cuda/12.1 -lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \ +BACKEND=${BACKEND:-vllm} # Default to vLLM if not specified + +lighteval $BACKEND $MODEL_ARGS "custom|$TASK|0|0" \ --custom-tasks src/open_r1/evaluate.py \ --use-chat-template \ --system-prompt="Please reason step by step, and put your final answer within \boxed{}." 
\ --save-details \ - --output-dir $OUTPUT_DIR - + --output-dir $OUTPUT_DIR echo "END TIME: $(date)" diff --git a/src/open_r1/generate.py b/src/open_r1/generate.py index 40ff3b39..4e4c74f6 100644 --- a/src/open_r1/generate.py +++ b/src/open_r1/generate.py @@ -18,10 +18,46 @@ from distilabel.pipeline import Pipeline from distilabel.steps import StepResources from distilabel.steps.tasks import TextGeneration +from typing import Union +from sglang import RuntimeClient + + +class LLMBackend: + def __init__(self, backend: str = "vllm"): + self.backend = backend + + def get_llm( + self, + model: str, + base_url: str = "http://localhost:8000/v1", + timeout: int = 900, + max_retries: int = 0, + generation_kwargs: dict = None + ) -> Union[OpenAILLM, RuntimeClient]: + if self.backend == "vllm": + return OpenAILLM( + base_url=base_url, + api_key="something", + model=model, + timeout=timeout, + max_retries=max_retries, + generation_kwargs=generation_kwargs, + ) + elif self.backend == "sglang": + return RuntimeClient( + model=model, + api_base=base_url, + timeout=timeout, + max_retries=max_retries, + **generation_kwargs + ) + else: + raise ValueError(f"Unknown backend: {self.backend}") def build_distilabel_pipeline( model: str, + backend: str = "vllm", # Add backend parameter base_url: str = "http://localhost:8000/v1", prompt_column: Optional[str] = None, prompt_template: str = "{{ instruction }}", @@ -42,12 +78,13 @@ def build_distilabel_pipeline( if top_p is not None: generation_kwargs["top_p"] = top_p + llm_backend = LLMBackend(backend=backend) + with Pipeline().ray() as pipeline: TextGeneration( - llm=OpenAILLM( + llm=llm_backend.get_llm( + model, base_url=base_url, - api_key="something", - model=model, timeout=timeout, max_retries=retries, generation_kwargs=generation_kwargs, @@ -167,6 +204,13 @@ def build_distilabel_pipeline( action="store_true", help="Whether to make the output dataset private when pushing to HF Hub", ) + parser.add_argument( + "--backend", + type=str, + default="vllm", + choices=["vllm", "sglang"], + help="Backend to use for generation (vllm or sglang)", + ) args = parser.parse_args() diff --git a/src/open_r1/utils/evaluation.py b/src/open_r1/utils/evaluation.py index 9cbac82d..2fe53372 100644 --- a/src/open_r1/utils/evaluation.py +++ b/src/open_r1/utils/evaluation.py @@ -50,8 +50,22 @@ def register_lighteval_task( register_lighteval_task(LIGHTEVAL_TASKS, "custom", "aime24", "aime24", 0) -def get_lighteval_tasks(): - return list(LIGHTEVAL_TASKS.keys()) +def get_lighteval_tasks(backend: str = "vllm"): + """Get lighteval tasks with specified backend configuration. + + Args: + backend (str, optional): Backend to use for evaluation. Either "vllm" or "sglang". Defaults to "vllm". 
+ + Returns: + List[str]: List of available task names + """ + tasks = LIGHTEVAL_TASKS.copy() + if backend == "sglang": + # Modify task configurations for SGLang backend + for task_name, task_config in tasks.items(): + # Add SGLang specific configuration while preserving the task definition + tasks[task_name] = f"custom|{task_config}|sglang" + return list(tasks.keys()) SUPPORTED_BENCHMARKS = get_lighteval_tasks() From 2210298fd28f6983e3861cc048acd0a74f94f20b Mon Sep 17 00:00:00 2001 From: Jhin <47354855+jhinpan@users.noreply.github.com> Date: Mon, 3 Feb 2025 13:06:03 -0600 Subject: [PATCH 2/4] Some Doc Style and Coding Style Fix --- README.md | 6 +++--- setup.py | 2 +- src/open_r1/generate.py | 2 +- src/open_r1/utils/evaluation.py | 7 +++++-- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index d71a6cea..e155a499 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ export LD_LIBRARY_PATH=$(python -c "import site; print(site.getsitepackages()[0] This will also install PyTorch `v2.5.1` and it is **very important** to use this version since the vLLM binaries are compiled for it. -Alternatively, you can use *SGLang* as a replacement for *vLLM*: +Alternatively, you can use *SGLang* as inference backend. ```shell pip install --upgrade pip @@ -207,11 +207,11 @@ To use Tensor Parallelism: make evaluate MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B TASK=aime24 PARALLEL=tensor NUM_GPUS=8 ``` -To use SGLang instead of vLLM, you would then run the commands with the --backend sglang flag or BACKEND=sglang for make commands. For example: +To use SGLang, you would then run the commands with the `--backend sglang` flag or `BACKEND=sglang` for make commands. For example: To evaluate on a single GPU: ```shell -make evaluate MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B TASK=aime24 BackEND=sglang +make evaluate MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B TASK=aime24 BACKEND=sglang ``` diff --git a/setup.py b/setup.py index 662fea28..b8552983 100644 --- a/setup.py +++ b/setup.py @@ -64,7 +64,7 @@ "transformers @ git+https://github.com/huggingface/transformers.git@main", "trl @ git+https://github.com/huggingface/trl.git@main", "vllm>=0.7.0", - "sglang>=0.4.0", + "sglang>=0.4.1", "wandb>=0.19.1", ] diff --git a/src/open_r1/generate.py b/src/open_r1/generate.py index 4e4c74f6..9af96868 100644 --- a/src/open_r1/generate.py +++ b/src/open_r1/generate.py @@ -57,7 +57,7 @@ def get_llm( def build_distilabel_pipeline( model: str, - backend: str = "vllm", # Add backend parameter + backend: str = "vllm", base_url: str = "http://localhost:8000/v1", prompt_column: Optional[str] = None, prompt_template: str = "{{ instruction }}", diff --git a/src/open_r1/utils/evaluation.py b/src/open_r1/utils/evaluation.py index 2fe53372..29e815af 100644 --- a/src/open_r1/utils/evaluation.py +++ b/src/open_r1/utils/evaluation.py @@ -52,10 +52,10 @@ def register_lighteval_task( def get_lighteval_tasks(backend: str = "vllm"): """Get lighteval tasks with specified backend configuration. - + Args: backend (str, optional): Backend to use for evaluation. Either "vllm" or "sglang". Defaults to "vllm". 
- + Returns: List[str]: List of available task names """ @@ -65,6 +65,9 @@ def get_lighteval_tasks(backend: str = "vllm"): for task_name, task_config in tasks.items(): # Add SGLang specific configuration while preserving the task definition tasks[task_name] = f"custom|{task_config}|sglang" + else: + # Keep original task configurations for vLLM backend + pass return list(tasks.keys()) From 5180fc6c6c989efd26c2aeeb843bdddd93b370f6 Mon Sep 17 00:00:00 2001 From: Jhin <47354855+jhinpan@users.noreply.github.com> Date: Mon, 3 Feb 2025 13:48:46 -0600 Subject: [PATCH 3/4] Add testing script for backend inference --- scripts/test_sampling.py | 98 ++++++++++++++++++++++++++++++++++ scripts/test_sglang_backend.py | 71 ++++++++++++++++++++++++ 2 files changed, 169 insertions(+) create mode 100644 scripts/test_sampling.py create mode 100644 scripts/test_sglang_backend.py diff --git a/scripts/test_sampling.py b/scripts/test_sampling.py new file mode 100644 index 00000000..66a0a92c --- /dev/null +++ b/scripts/test_sampling.py @@ -0,0 +1,98 @@ +from datasets import Dataset +from open_r1.generate import build_distilabel_pipeline + +# To run this test: +# First ensure both backends are running: +# For vLLM: +# python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf --port 8000 +# +# For SGLang (in a different terminal): +# sglang start-server --model meta-llama/Llama-2-7b-chat-hf --port 8001 +# +# Then run: python test_sampling.py +# +# This test script: +# - Tests four different sampling configurations: +# - High temperature (more random) +# - Low temperature (more deterministic) +# - Top-p sampling +# - Combined temperature and top-p +# - Generates multiple responses for each configuration to observe variation +# - Uses a single prompt that should produce creative, varied responses to make sampling differences more apparent + + +def test_sampling_behavior(backend: str): + # Single prompt to test sampling variations + test_data = { + "prompt": [ + "Write a creative color name and briefly describe it.", # This prompt should generate varied responses with sampling + ] + } + test_dataset = Dataset.from_dict(test_data) + + # Test different sampling configurations + sampling_configs = [ + { + "name": "High Temperature", + "params": {"temperature": 0.9, "top_p": None} + }, + { + "name": "Low Temperature", + "params": {"temperature": 0.1, "top_p": None} + }, + { + "name": "Top-P Sampling", + "params": {"temperature": None, "top_p": 0.9} + }, + { + "name": "Combined Sampling", + "params": {"temperature": 0.7, "top_p": 0.9} + }, + ] + + print(f"\nTesting {backend} backend sampling behavior:") + print("=" * 50) + + for config in sampling_configs: + print(f"\nTesting {config['name']}:") + print("-" * 30) + + # Create pipeline with specific sampling parameters + pipeline = build_distilabel_pipeline( + model="meta-llama/Llama-2-7b-chat-hf", # Replace with your model + backend=backend, + base_url="http://localhost:8000", + prompt_column="prompt", + temperature=config['params']['temperature'], + top_p=config['params']['top_p'], + max_new_tokens=50, # Keep short for testing + num_generations=3, # Generate multiple responses to see variation + input_batch_size=1, + timeout=30, + ) + + # Run inference + results = pipeline.run( + dataset=test_dataset, + dataset_batch_size=1, + use_cache=False, + ) + + # Print all generations for this configuration + for item in results: + print("\nGenerations:") + for i, gen in enumerate(item['generations'], 1): + print(f" {i}. 
{gen}") + + +def main(): + # Test both backends + for backend in ["vllm", "sglang"]: + try: + test_sampling_behavior(backend) + except Exception as e: + print(f"\nError testing {backend} backend: {str(e)}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/test_sglang_backend.py b/scripts/test_sglang_backend.py new file mode 100644 index 00000000..e514ba28 --- /dev/null +++ b/scripts/test_sglang_backend.py @@ -0,0 +1,71 @@ +from datasets import Dataset +from open_r1.generate import build_distilabel_pipeline + +# 1. First, make sure you have a SGLang server running: +# ```bash +# # Example of starting SGLang server (adjust according to your setup) +# sglang start-server --model meta-llama/Llama-2-7b-chat-hf --port 8000 +# ``` +# Then run the test script: +# ```bash +# python test_sglang_backend.py +# ``` + +# Some key points about the implementation: +# The test uses a tiny dataset with just 2 prompts to verify functionality quickly +# We're using small values for batch size and max_new_tokens to keep resource usage low +# The timeout is reduced to 30 seconds for faster testing +# Need to adjust the model name and server URL according to your setup +# To verify different aspects, we may: +# - Test error handling: +# - Test different generation parameters: +# - Test batch processing: + +# The main differences between vLLM and SGLang backends in your current implementation are: +# The API interface (OpenAI-compatible vs SGLang native) +# Parameter handling (generation_kwargs are passed differently) +# If you encounter any issues, the most likely points of failure would be: +# API compatibility +# Parameter mapping between backends +# Response format differences + + +def test_sglang_backend(): + # Create a tiny test dataset + test_data = { + "prompt": [ + "What is 2+2?", + "Write a one-sentence story.", + ] + } + test_dataset = Dataset.from_dict(test_data) + + # Initialize the pipeline with SGLang backend + pipeline = build_distilabel_pipeline( + model="meta-llama/Llama-2-7b-chat-hf", # Replace with your model + backend="sglang", + base_url="http://localhost:8000", # Adjust to your SGLang server URL + prompt_column="prompt", + temperature=0.7, + max_new_tokens=100, # Smaller for testing + num_generations=1, + input_batch_size=2, # Small batch size for testing + timeout=30, # Shorter timeout for testing + ) + + # Run inference + print("Running inference...") + results = pipeline.run( + dataset=test_dataset, + dataset_batch_size=2, + use_cache=False, + ) + + # Print results + print("\nGenerated responses:") + for i, item in enumerate(results): + print(f"\nPrompt {i+1}: {test_data['prompt'][i]}") + print(f"Response: {item['generations'][0]}") + +if __name__ == "__main__": + test_sglang_backend() \ No newline at end of file From 296418bc3bab6cf887e192ea5548dc1e9858fc68 Mon Sep 17 00:00:00 2001 From: Chayenne Date: Wed, 5 Feb 2025 14:24:03 -0800 Subject: [PATCH 4/4] Stash changes made in uclaml03 --- scripts/test_generate.py | 123 ++++++++++++++++++++++ scripts/test_sampling.py | 98 ------------------ scripts/test_sglang_backend.py | 71 ------------- src/open_r1/generate.py | 180 ++++++++++++++++++++------------- src/open_r1/pipeline.py | 43 ++++++++ src/open_r1/sft.py | 2 +- 6 files changed, 274 insertions(+), 243 deletions(-) create mode 100644 scripts/test_generate.py delete mode 100644 scripts/test_sampling.py delete mode 100644 scripts/test_sglang_backend.py mode change 100644 => 100755 src/open_r1/generate.py create mode 100644 src/open_r1/pipeline.py 
diff --git a/scripts/test_generate.py b/scripts/test_generate.py new file mode 100644 index 00000000..f2c632de --- /dev/null +++ b/scripts/test_generate.py @@ -0,0 +1,123 @@ +import pytest +import uuid +from datasets import Dataset +from open_r1.generate import build_distilabel_pipeline + +@pytest.fixture +def test_dataset(): + """Create a small test dataset.""" + return Dataset.from_dict({ + "prompt": [ + "What is 2+2?", + "Explain quantum computing in one sentence.", + "Write a haiku about programming." + ] + }) + +@pytest.mark.parametrize("backend", ["vllm", "sglang"]) +@pytest.mark.parametrize("batch_size", [1, 2]) +def test_basic_inference(test_dataset, backend, batch_size): + """Test basic inference capabilities with different batch sizes.""" + unique_id = str(uuid.uuid4())[:8] + + generation_kwargs = { + "temperature": 0.0, # Deterministic for testing + "max_new_tokens": 50, + "top_p": 1.0 + } + + pipeline = build_distilabel_pipeline( + model="meta-llama/Llama-2-7b-chat-hf", + backend=backend, + prompt_column="prompt", + generation_kwargs=generation_kwargs, + input_batch_size=batch_size, + unique_id=unique_id, + num_generations=1 + ) + + result = pipeline.run( + dataset=test_dataset, + dataset_batch_size=batch_size, + use_cache=False, + ) + + # Basic validation + assert len(result) == len(test_dataset) + assert "text" in result.features + assert all(isinstance(gen, str) and len(gen) > 0 for gen in result["text"]) + +@pytest.mark.parametrize("backend", ["vllm", "sglang"]) +def test_error_handling(test_dataset, backend): + """Test error handling with invalid configurations.""" + with pytest.raises(ValueError): + # Test with invalid batch size + pipeline = build_distilabel_pipeline( + model="meta-llama/Llama-2-7b-chat-hf", + backend=backend, + input_batch_size=0 + ) + +@pytest.mark.parametrize("backend", ["vllm", "sglang"]) +def test_multiple_generations(test_dataset, backend): + """Test multiple generations per prompt.""" + unique_id = str(uuid.uuid4())[:8] + + generation_kwargs = { + "temperature": 0.7, # Non-deterministic for multiple generations + "max_new_tokens": 50, + "top_p": 0.95 + } + + pipeline = build_distilabel_pipeline( + model="meta-llama/Llama-2-7b-chat-hf", + backend=backend, + prompt_column="prompt", + generation_kwargs=generation_kwargs, + input_batch_size=1, + unique_id=unique_id, + num_generations=2 # Generate 2 responses per prompt + ) + + result = pipeline.run( + dataset=test_dataset, + dataset_batch_size=1, + use_cache=False, + ) + + # Verify multiple generations + assert len(result) == len(test_dataset) * 2 + assert "text" in result.features + +@pytest.mark.parametrize("backend", ["vllm", "sglang"]) +def test_generation_parameters(test_dataset, backend): + """Test different generation parameters.""" + generation_configs = [ + {"temperature": 0.0, "max_new_tokens": 10, "top_p": 1.0}, + {"temperature": 0.8, "max_new_tokens": 100, "top_p": 0.9}, + ] + + for gen_kwargs in generation_configs: + pipeline = build_distilabel_pipeline( + model="meta-llama/Llama-2-7b-chat-hf", + backend=backend, + prompt_column="prompt", + generation_kwargs=gen_kwargs, + input_batch_size=1 + ) + + result = pipeline.run( + dataset=test_dataset, + dataset_batch_size=1, + use_cache=False, + ) + + assert len(result) == len(test_dataset) + if gen_kwargs["temperature"] == 0.0: + # For deterministic generation, outputs should be identical across runs + second_result = pipeline.run( + dataset=test_dataset, + dataset_batch_size=1, + use_cache=False, + ) + assert all(r1 == r2 for r1, r2 in 
zip(result["text"], second_result["text"])) \ No newline at end of file diff --git a/scripts/test_sampling.py b/scripts/test_sampling.py deleted file mode 100644 index 66a0a92c..00000000 --- a/scripts/test_sampling.py +++ /dev/null @@ -1,98 +0,0 @@ -from datasets import Dataset -from open_r1.generate import build_distilabel_pipeline - -# To run this test: -# First ensure both backends are running: -# For vLLM: -# python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf --port 8000 -# -# For SGLang (in a different terminal): -# sglang start-server --model meta-llama/Llama-2-7b-chat-hf --port 8001 -# -# Then run: python test_sampling.py -# -# This test script: -# - Tests four different sampling configurations: -# - High temperature (more random) -# - Low temperature (more deterministic) -# - Top-p sampling -# - Combined temperature and top-p -# - Generates multiple responses for each configuration to observe variation -# - Uses a single prompt that should produce creative, varied responses to make sampling differences more apparent - - -def test_sampling_behavior(backend: str): - # Single prompt to test sampling variations - test_data = { - "prompt": [ - "Write a creative color name and briefly describe it.", # This prompt should generate varied responses with sampling - ] - } - test_dataset = Dataset.from_dict(test_data) - - # Test different sampling configurations - sampling_configs = [ - { - "name": "High Temperature", - "params": {"temperature": 0.9, "top_p": None} - }, - { - "name": "Low Temperature", - "params": {"temperature": 0.1, "top_p": None} - }, - { - "name": "Top-P Sampling", - "params": {"temperature": None, "top_p": 0.9} - }, - { - "name": "Combined Sampling", - "params": {"temperature": 0.7, "top_p": 0.9} - }, - ] - - print(f"\nTesting {backend} backend sampling behavior:") - print("=" * 50) - - for config in sampling_configs: - print(f"\nTesting {config['name']}:") - print("-" * 30) - - # Create pipeline with specific sampling parameters - pipeline = build_distilabel_pipeline( - model="meta-llama/Llama-2-7b-chat-hf", # Replace with your model - backend=backend, - base_url="http://localhost:8000", - prompt_column="prompt", - temperature=config['params']['temperature'], - top_p=config['params']['top_p'], - max_new_tokens=50, # Keep short for testing - num_generations=3, # Generate multiple responses to see variation - input_batch_size=1, - timeout=30, - ) - - # Run inference - results = pipeline.run( - dataset=test_dataset, - dataset_batch_size=1, - use_cache=False, - ) - - # Print all generations for this configuration - for item in results: - print("\nGenerations:") - for i, gen in enumerate(item['generations'], 1): - print(f" {i}. {gen}") - - -def main(): - # Test both backends - for backend in ["vllm", "sglang"]: - try: - test_sampling_behavior(backend) - except Exception as e: - print(f"\nError testing {backend} backend: {str(e)}") - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/scripts/test_sglang_backend.py b/scripts/test_sglang_backend.py deleted file mode 100644 index e514ba28..00000000 --- a/scripts/test_sglang_backend.py +++ /dev/null @@ -1,71 +0,0 @@ -from datasets import Dataset -from open_r1.generate import build_distilabel_pipeline - -# 1. 
First, make sure you have a SGLang server running: -# ```bash -# # Example of starting SGLang server (adjust according to your setup) -# sglang start-server --model meta-llama/Llama-2-7b-chat-hf --port 8000 -# ``` -# Then run the test script: -# ```bash -# python test_sglang_backend.py -# ``` - -# Some key points about the implementation: -# The test uses a tiny dataset with just 2 prompts to verify functionality quickly -# We're using small values for batch size and max_new_tokens to keep resource usage low -# The timeout is reduced to 30 seconds for faster testing -# Need to adjust the model name and server URL according to your setup -# To verify different aspects, we may: -# - Test error handling: -# - Test different generation parameters: -# - Test batch processing: - -# The main differences between vLLM and SGLang backends in your current implementation are: -# The API interface (OpenAI-compatible vs SGLang native) -# Parameter handling (generation_kwargs are passed differently) -# If you encounter any issues, the most likely points of failure would be: -# API compatibility -# Parameter mapping between backends -# Response format differences - - -def test_sglang_backend(): - # Create a tiny test dataset - test_data = { - "prompt": [ - "What is 2+2?", - "Write a one-sentence story.", - ] - } - test_dataset = Dataset.from_dict(test_data) - - # Initialize the pipeline with SGLang backend - pipeline = build_distilabel_pipeline( - model="meta-llama/Llama-2-7b-chat-hf", # Replace with your model - backend="sglang", - base_url="http://localhost:8000", # Adjust to your SGLang server URL - prompt_column="prompt", - temperature=0.7, - max_new_tokens=100, # Smaller for testing - num_generations=1, - input_batch_size=2, # Small batch size for testing - timeout=30, # Shorter timeout for testing - ) - - # Run inference - print("Running inference...") - results = pipeline.run( - dataset=test_dataset, - dataset_batch_size=2, - use_cache=False, - ) - - # Print results - print("\nGenerated responses:") - for i, item in enumerate(results): - print(f"\nPrompt {i+1}: {test_data['prompt'][i]}") - print(f"Response: {item['generations'][0]}") - -if __name__ == "__main__": - test_sglang_backend() \ No newline at end of file diff --git a/src/open_r1/generate.py b/src/open_r1/generate.py old mode 100644 new mode 100755 index 9af96868..a64ea0bf --- a/src/open_r1/generate.py +++ b/src/open_r1/generate.py @@ -13,13 +13,12 @@ # limitations under the License. 
from typing import Optional - +from typing import Union from distilabel.llms import OpenAILLM from distilabel.pipeline import Pipeline from distilabel.steps import StepResources from distilabel.steps.tasks import TextGeneration -from typing import Union -from sglang import RuntimeClient +import sglang as sgl class LLMBackend: @@ -32,25 +31,47 @@ def get_llm( base_url: str = "http://localhost:8000/v1", timeout: int = 900, max_retries: int = 0, - generation_kwargs: dict = None - ) -> Union[OpenAILLM, RuntimeClient]: - if self.backend == "vllm": - return OpenAILLM( - base_url=base_url, - api_key="something", - model=model, - timeout=timeout, - max_retries=max_retries, - generation_kwargs=generation_kwargs, - ) - elif self.backend == "sglang": - return RuntimeClient( - model=model, - api_base=base_url, - timeout=timeout, - max_retries=max_retries, - **generation_kwargs + generation_kwargs: dict = None, + ) -> Union[OpenAILLM, sgl.Engine]: + """Get LLM instance based on backend.""" + if generation_kwargs is None: + generation_kwargs = {} + + if self.backend == "sglang": + # SGLang engine initialization with correct parameters + sglang_kwargs = { + "model_path": model, + "log_level": "error", + "device": "cuda", + "dtype": "float16", + "max_model_len": 4096, # Add reasonable default max length + } + + # Create engine with basic parameters + engine = sgl.Engine(**sglang_kwargs) + + # Set generation config after initialization + max_tokens = generation_kwargs.get("max_new_tokens", 512) + engine.set_generation_params( + max_tokens=max_tokens, + temperature=generation_kwargs.get("temperature", 0.7), + top_p=generation_kwargs.get("top_p", 0.95) ) + + return engine + + elif self.backend == "vllm": + try: + return OpenAILLM( + base_url=base_url, + api_key="dummy-key", # vLLM doesn't require real key + model=model, + timeout=timeout, + max_retries=max_retries, + generation_kwargs=generation_kwargs, + ) + except Exception as e: + raise ConnectionError(f"Failed to connect to vLLM server at {base_url}: {str(e)}") else: raise ValueError(f"Unknown backend: {self.backend}") @@ -61,26 +82,40 @@ def build_distilabel_pipeline( base_url: str = "http://localhost:8000/v1", prompt_column: Optional[str] = None, prompt_template: str = "{{ instruction }}", - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_new_tokens: int = 8192, + generation_kwargs: Optional[dict] = None, num_generations: int = 1, input_batch_size: int = 64, client_replicas: int = 1, timeout: int = 900, retries: int = 0, + unique_id: Optional[str] = None, ) -> Pipeline: - generation_kwargs = {"max_new_tokens": max_new_tokens} - - if temperature is not None: - generation_kwargs["temperature"] = temperature + """Build a distilabel pipeline for text generation. + + Args: + model (str): Model identifier or path + backend (str, optional): Backend to use ("vllm" or "sglang"). Defaults to "vllm". + base_url (str, optional): Base URL for vLLM server. Defaults to "http://localhost:8000/v1". + prompt_column (str, optional): Column name containing prompts. Defaults to None. + prompt_template (str, optional): Template for formatting prompts. Defaults to "{{ instruction }}". + generation_kwargs (dict, optional): Generation parameters. Defaults to None. + num_generations (int, optional): Number of generations per prompt. Defaults to 1. + input_batch_size (int, optional): Batch size for processing. Defaults to 64. + client_replicas (int, optional): Number of client replicas. Defaults to 1. + timeout (int, optional): Timeout in seconds. 
Defaults to 900. + retries (int, optional): Number of retries. Defaults to 0. + unique_id (str, optional): Unique identifier for the pipeline. Defaults to None. - if top_p is not None: - generation_kwargs["top_p"] = top_p + Returns: + Pipeline: Configured distilabel pipeline + """ + if generation_kwargs is None: + generation_kwargs = {} llm_backend = LLMBackend(backend=backend) - - with Pipeline().ray() as pipeline: + pipeline_name = f"pipeline_{unique_id}" if unique_id else "pipeline" + + with Pipeline(name=pipeline_name).ray() as pipeline: TextGeneration( llm=llm_backend.get_llm( model, @@ -102,10 +137,9 @@ def build_distilabel_pipeline( if __name__ == "__main__": import argparse - from datasets import load_dataset - parser = argparse.ArgumentParser(description="Run distilabel pipeline for generating responses with DeepSeek R1") + parser = argparse.ArgumentParser(description="Run distilabel pipeline for generating responses") parser.add_argument( "--hf-dataset", type=str, @@ -128,6 +162,7 @@ def build_distilabel_pipeline( "--prompt-column", type=str, default="prompt", + help="Column containing prompts", ) parser.add_argument( "--prompt-template", @@ -142,96 +177,95 @@ def build_distilabel_pipeline( help="Model name to use for generation", ) parser.add_argument( - "--vllm-server-url", + "--backend", + type=str, + default="vllm", + choices=["vllm", "sglang"], + help="Backend to use for generation", + ) + parser.add_argument( + "--base-url", type=str, default="http://localhost:8000/v1", - help="URL of the vLLM server", + help="Base URL for the backend server", ) parser.add_argument( "--temperature", type=float, + default=0.7, help="Temperature for generation", ) parser.add_argument( "--top-p", type=float, + default=0.95, help="Top-p value for generation", ) parser.add_argument( "--max-new-tokens", type=int, - default=8192, + default=512, help="Maximum number of new tokens to generate", ) - parser.add_argument( - "--num-generations", - type=int, - default=1, - help="Number of generations per problem", - ) parser.add_argument( "--input-batch-size", type=int, - default=64, - help="Batch size for input processing", + default=32, + help="Batch size for processing inputs", ) parser.add_argument( "--client-replicas", type=int, default=1, - help="Number of client replicas for parallel processing", + help="Number of client replicas", ) parser.add_argument( "--timeout", type=int, - default=600, - help="Request timeout in seconds (default: 600)", + default=900, + help="Timeout in seconds", ) parser.add_argument( "--retries", type=int, default=0, - help="Number of retries for failed requests (default: 0)", + help="Number of retries", ) parser.add_argument( "--hf-output-dataset", type=str, - required=False, - help="HuggingFace repo to push results to", + help="HuggingFace dataset name to push results to", ) parser.add_argument( "--private", action="store_true", - help="Whether to make the output dataset private when pushing to HF Hub", - ) - parser.add_argument( - "--backend", - type=str, - default="vllm", - choices=["vllm", "sglang"], - help="Backend to use for generation (vllm or sglang)", + help="Whether to make the output dataset private", ) args = parser.parse_args() - print("\nRunning with arguments:") - for arg, value in vars(args).items(): - print(f" {arg}: {value}") - print() + # Load dataset + dataset = load_dataset( + args.hf_dataset, + args.hf_dataset_config, + split=args.hf_dataset_split, + ) - print(f"Loading '{args.hf_dataset}' (config: {args.hf_dataset_config}, split: 
{args.hf_dataset_split}) dataset...") - dataset = load_dataset(args.hf_dataset, args.hf_dataset_config, split=args.hf_dataset_split) - print("Dataset loaded!") + # Prepare generation kwargs + generation_kwargs = { + "temperature": args.temperature, + "top_p": args.top_p, + "max_new_tokens": args.max_new_tokens, + } + # Build and run pipeline pipeline = build_distilabel_pipeline( model=args.model, - base_url=args.vllm_server_url, - prompt_template=args.prompt_template, + backend=args.backend, + base_url=args.base_url, prompt_column=args.prompt_column, - temperature=args.temperature, - top_p=args.top_p, - max_new_tokens=args.max_new_tokens, - num_generations=args.num_generations, + prompt_template=args.prompt_template, + generation_kwargs=generation_kwargs, input_batch_size=args.input_batch_size, client_replicas=args.client_replicas, timeout=args.timeout, @@ -249,4 +283,4 @@ def build_distilabel_pipeline( if args.hf_output_dataset: print(f"Pushing resulting dataset to '{args.hf_output_dataset}'...") distiset.push_to_hub(args.hf_output_dataset, private=args.private) - print("Dataset pushed!") + print("Dataset pushed!") \ No newline at end of file diff --git a/src/open_r1/pipeline.py b/src/open_r1/pipeline.py new file mode 100644 index 00000000..efd8dac6 --- /dev/null +++ b/src/open_r1/pipeline.py @@ -0,0 +1,43 @@ +from datasets import load_dataset +from distilabel.models import vLLM +from distilabel.pipeline import Pipeline +from distilabel.steps.tasks import TextGeneration + + +prompt_template = """\ +You will be given a problem. Please reason step by step, and put your final answer within \boxed{}: +{{ instruction }}""" + +dataset = load_dataset("AI-MO/NuminaMath-TIR", split="train").select(range(10)) + +model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" # Exchange with another smol distilled r1 + +with Pipeline( + name="distill-qwen-7b-r1", + description="A pipeline to generate data from a distilled r1 model", +) as pipeline: + + llm = vLLM( + model=model_id, + tokenizer=model_id, + extra_kwargs={ + "tensor_parallel_size": 1, + "max_model_len": 8192, + }, + generation_kwargs={ + "temperature": 0.6, + "max_new_tokens": 8192, + }, + ) + prompt_column = "problem" + text_generation = TextGeneration( + llm=llm, + template=prompt_template, + num_generations=4, + input_mappings={"instruction": prompt_column} if prompt_column is not None else {} + ) + + +if __name__ == "__main__": + distiset = pipeline.run(dataset=dataset) + distiset.push_to_hub(repo_id="JinnP/numina-deepseek-r1-qwen-7b") \ No newline at end of file diff --git a/src/open_r1/sft.py b/src/open_r1/sft.py index d5369fc2..1a88ba6a 100644 --- a/src/open_r1/sft.py +++ b/src/open_r1/sft.py @@ -167,7 +167,7 @@ def main(script_args, training_args, model_args): # Save everything else on main process kwargs = { - "finetuned_from": model_args.model_name_or_path, + # "finetuned_from": model_args.model_name_or_path, # This argument is not supported "dataset": list(script_args.dataset_name), "dataset_tags": list(script_args.dataset_name), "tags": ["open-r1"],
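(Editorial aside, not part of the patch series: taken together, these commits surface the backend choice through the `--backend` flag added to `src/open_r1/generate.py`. A plausible invocation of the updated script with the SGLang backend is sketched below; every flag maps to an argparse option defined in the final revision of `generate.py`, and the dataset, model, prompt column, and output repo names are the same examples already used elsewhere in this repo. It assumes the SGLang dependencies are installed and a GPU is available.)

```shell
python src/open_r1/generate.py \
    --hf-dataset AI-MO/NuminaMath-TIR \
    --model deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \
    --backend sglang \
    --prompt-column problem \
    --temperature 0.6 \
    --max-new-tokens 512 \
    --hf-output-dataset username/r1-dataset
```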