From 50f5adcc8abc0002af39228fedfc35ab68be1d82 Mon Sep 17 00:00:00 2001 From: Jhin <47354855+jhinpan@users.noreply.github.com> Date: Sun, 2 Feb 2025 13:31:10 -0600 Subject: [PATCH 1/4] Feature: Add SGLang Support to backend --- Makefile | 1 + README.md | 63 ++++++++++++++++++++++----------- setup.py | 1 + slurm/evaluate.slurm | 9 ++--- src/open_r1/generate.py | 50 ++++++++++++++++++++++++-- src/open_r1/utils/evaluation.py | 18 ++++++++-- 6 files changed, 113 insertions(+), 29 deletions(-) diff --git a/Makefile b/Makefile index 1140e59e..5be6765f 100644 --- a/Makefile +++ b/Makefile @@ -18,6 +18,7 @@ quality: # Evaluation evaluate: + $(eval BACKEND := $(if $(BACKEND),$(BACKEND),vllm)) $(eval PARALLEL_ARGS := $(if $(PARALLEL),$(shell \ if [ "$(PARALLEL)" = "data" ]; then \ echo "data_parallel_size=$(NUM_GPUS)"; \ diff --git a/README.md b/README.md index ebb725e2..d71a6cea 100644 --- a/README.md +++ b/README.md @@ -2,18 +2,18 @@ *A fully open reproduction of DeepSeek-R1. This repo is a work in progress, let's build it together!* -**Table of Contents** -1. [Overview](#overview) -2. [Plan of attack](#plan-of-attack) -3. [Installation](#installation) -4. [Training models](#training-models) - - [SFT](#sft) - - [GRPO](#grpo) -5. [Evaluating models](#evaluating-models) -6. [Reproducing Deepseek's evaluation results on MATH-500](#reproducing-deepseeks-evaluation-results-on-math-500) -7. [Data generation](#data-generation) - - [Generate data from a smol distilled R1 model](#generate-data-from-a-smol-distilled-r1-model) - - [Generate data from DeepSeek-R1](#generate-data-from-deepseek-r1) +**Table of Contents** +1. [Overview](#overview) +2. [Plan of attack](#plan-of-attack) +3. [Installation](#installation) +4. [Training models](#training-models) + - [SFT](#sft) + - [GRPO](#grpo) +5. [Evaluating models](#evaluating-models) +6. [Reproducing Deepseek's evaluation results on MATH-500](#reproducing-deepseeks-evaluation-results-on-math-500) +7. [Data generation](#data-generation) + - [Generate data from a smol distilled R1 model](#generate-data-from-a-smol-distilled-r1-model) + - [Generate data from DeepSeek-R1](#generate-data-from-deepseek-r1) 8. [Contributing](#contributing) ## Overview @@ -45,6 +45,7 @@ We will use the DeepSeek-R1 [tech report](https://github.com/deepseek-ai/DeepSee **Note: Libraries rely on CUDA 12.1. Double check your system if you get segmentation faults.** + To run the code in this project, first, create a Python virtual environment using e.g. `uv`. To install `uv`, follow the [UV Installation Guide](https://docs.astral.sh/uv/getting-started/installation/). @@ -63,7 +64,19 @@ pip install vllm>=0.7.0 --extra-index-url https://download.pytorch.org/whl/cu121 export LD_LIBRARY_PATH=$(python -c "import site; print(site.getsitepackages()[0] + '/nvidia/nvjitlink/lib')"):$LD_LIBRARY_PATH ``` -This will also install PyTorch `v2.5.1` and it is **very important** to use this version since the vLLM binaries are compiled for it. You can then install the remaining dependencies for your specific use case via `pip install -e .[LIST OF MODES]`. For most contributors, we recommend: +This will also install PyTorch `v2.5.1` and it is **very important** to use this version since the vLLM binaries are compiled for it. 
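(Editorial aside, not part of the patch: since the paragraph above stresses that PyTorch must stay at the version the vLLM binaries were compiled for, a quick sanity check after installation is shown below; the `2.5.1` value is simply the version quoted in the sentence above.)

```shell
python -c "import torch; print(torch.__version__)"  # expect 2.5.1 when using the vLLM install path
```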
+ +Alternatively, you can use *SGLang* as a replacement for *vLLM*: + +```shell +pip install --upgrade pip +pip install sgl-kernel --force-reinstall --no-deps +pip install "sglang[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/ +``` + +**Note:** When using SGLang, make sure to check the [FlashInfer installation doc](https://github.com/flashinfer-ai/flashinfer) to install the proper version according to your PyTorch and CUDA versions. + +You can then install the remaining dependencies for your specific use case via `pip install -e .[LIST OF MODES]`. For most contributors, we recommend: ```shell pip install -e ".[dev]" @@ -109,7 +122,7 @@ To launch a Slurm job, run: sbatch --output=/path/to/logs/%x-%j.out --err=/path/to/logs/%x-%j.err slurm/sft.slurm {model} {dataset} {accelerator} ``` -Here `{model}` and `{dataset}` refer to the model and dataset IDs on the Hugging Face Hub, while `{accelerator}` refers to the choice of an 🤗 Accelerate config file in configs. +Here `{model}` and `{dataset}` refer to the model and dataset IDs on the Hugging Face Hub, while `{accelerator}` refers to the choice of an 🤗 Accelerate config file in configs. ### GRPO @@ -141,7 +154,7 @@ lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \ --custom-tasks src/open_r1/evaluate.py \ --use-chat-template \ --system-prompt="Please reason step by step, and put your final answer within \boxed{}." \ - --output-dir $OUTPUT_DIR + --output-dir $OUTPUT_DIR ``` To increase throughput across multiple GPUs, use _data parallel_ as follows: @@ -157,7 +170,7 @@ lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \ --custom-tasks src/open_r1/evaluate.py \ --use-chat-template \ --system-prompt="Please reason step by step, and put your final answer within \boxed{}." \ - --output-dir $OUTPUT_DIR + --output-dir $OUTPUT_DIR ``` For large models which require sharding across GPUs, use _tensor parallel_ and run: @@ -174,7 +187,7 @@ lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \ --custom-tasks src/open_r1/evaluate.py \ --use-chat-template \ --system-prompt="Please reason step by step, and put your final answer within \boxed{}." \ - --output-dir $OUTPUT_DIR + --output-dir $OUTPUT_DIR ``` You can also launch an evaluation with `make evaluate`, specifying the model, task, and optionally the parallelism technique and number of GPUs. @@ -193,6 +206,16 @@ To use Tensor Parallelism: ```shell make evaluate MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B TASK=aime24 PARALLEL=tensor NUM_GPUS=8 ``` + +To use SGLang instead of vLLM, you would then run the commands with the --backend sglang flag or BACKEND=sglang for make commands. For example: + +To evaluate on a single GPU: +```shell +make evaluate MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B TASK=aime24 BackEND=sglang +``` + + + ## Reproducing Deepseek's evaluation results on MATH-500 We are able to reproduce Deepseek's reported results on the MATH-500 Benchmark: | Model | MATH-500 (HF lighteval) | MATH-500 (DeepSeek Reported) | @@ -222,7 +245,7 @@ sbatch slurm/evaluate.slurm deepseek-ai/DeepSeek-R1-Distill-Llama-70B math_500 t ### Generate data from a smol distilled R1 model -The following example can be run in 1xH100. +The following example can be run in 1xH100. 
First install the following dependencies: ```shell @@ -265,7 +288,7 @@ with Pipeline( ) prompt_column = "problem" text_generation = TextGeneration( - llm=llm, + llm=llm, template=prompt_template, num_generations=4, input_mappings={"instruction": prompt_column} if prompt_column is not None else {} @@ -302,7 +325,7 @@ sbatch slurm/generate.slurm \ --hf-output-dataset username/r1-dataset ``` -> [!NOTE] +> [!NOTE] > While the job is running, you can setup an SSH tunnel through the cluster login node to access the Ray dashboard from your computer running `ssh -L 8265:ray_ip_head_node:8265 `, then browsing `http://localhost:8265` ## Contributing diff --git a/setup.py b/setup.py index b3b10694..662fea28 100644 --- a/setup.py +++ b/setup.py @@ -64,6 +64,7 @@ "transformers @ git+https://github.com/huggingface/transformers.git@main", "trl @ git+https://github.com/huggingface/trl.git@main", "vllm>=0.7.0", + "sglang>=0.4.0", "wandb>=0.19.1", ] diff --git a/slurm/evaluate.slurm b/slurm/evaluate.slurm index 5fe7f8e3..2da6a8a5 100644 --- a/slurm/evaluate.slurm +++ b/slurm/evaluate.slurm @@ -4,7 +4,7 @@ #SBATCH --ntasks-per-node=1 #SBATCH --exclusive #SBATCH --gres=gpu:8 -#SBATCH --partition=hopper-prod +#SBATCH --partition=hopper-prod #SBATCH --time=01:59:00 #SBATCH --output=./logs/evaluate/%x-%j.out #SBATCH --err=./logs/evaluate/%x-%j.err @@ -44,12 +44,13 @@ export NCCL_ASYNC_ERROR_HANDLING=1 # Be ye warned this may not work on other clusters! module load cuda/12.1 -lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \ +BACKEND=${BACKEND:-vllm} # Default to vLLM if not specified + +lighteval $BACKEND $MODEL_ARGS "custom|$TASK|0|0" \ --custom-tasks src/open_r1/evaluate.py \ --use-chat-template \ --system-prompt="Please reason step by step, and put your final answer within \boxed{}." 
\ --save-details \ - --output-dir $OUTPUT_DIR - + --output-dir $OUTPUT_DIR echo "END TIME: $(date)" diff --git a/src/open_r1/generate.py b/src/open_r1/generate.py index 40ff3b39..4e4c74f6 100644 --- a/src/open_r1/generate.py +++ b/src/open_r1/generate.py @@ -18,10 +18,46 @@ from distilabel.pipeline import Pipeline from distilabel.steps import StepResources from distilabel.steps.tasks import TextGeneration +from typing import Union +from sglang import RuntimeClient + + +class LLMBackend: + def __init__(self, backend: str = "vllm"): + self.backend = backend + + def get_llm( + self, + model: str, + base_url: str = "http://localhost:8000/v1", + timeout: int = 900, + max_retries: int = 0, + generation_kwargs: dict = None + ) -> Union[OpenAILLM, RuntimeClient]: + if self.backend == "vllm": + return OpenAILLM( + base_url=base_url, + api_key="something", + model=model, + timeout=timeout, + max_retries=max_retries, + generation_kwargs=generation_kwargs, + ) + elif self.backend == "sglang": + return RuntimeClient( + model=model, + api_base=base_url, + timeout=timeout, + max_retries=max_retries, + **generation_kwargs + ) + else: + raise ValueError(f"Unknown backend: {self.backend}") def build_distilabel_pipeline( model: str, + backend: str = "vllm", # Add backend parameter base_url: str = "http://localhost:8000/v1", prompt_column: Optional[str] = None, prompt_template: str = "{{ instruction }}", @@ -42,12 +78,13 @@ def build_distilabel_pipeline( if top_p is not None: generation_kwargs["top_p"] = top_p + llm_backend = LLMBackend(backend=backend) + with Pipeline().ray() as pipeline: TextGeneration( - llm=OpenAILLM( + llm=llm_backend.get_llm( + model, base_url=base_url, - api_key="something", - model=model, timeout=timeout, max_retries=retries, generation_kwargs=generation_kwargs, @@ -167,6 +204,13 @@ def build_distilabel_pipeline( action="store_true", help="Whether to make the output dataset private when pushing to HF Hub", ) + parser.add_argument( + "--backend", + type=str, + default="vllm", + choices=["vllm", "sglang"], + help="Backend to use for generation (vllm or sglang)", + ) args = parser.parse_args() diff --git a/src/open_r1/utils/evaluation.py b/src/open_r1/utils/evaluation.py index 9cbac82d..2fe53372 100644 --- a/src/open_r1/utils/evaluation.py +++ b/src/open_r1/utils/evaluation.py @@ -50,8 +50,22 @@ def register_lighteval_task( register_lighteval_task(LIGHTEVAL_TASKS, "custom", "aime24", "aime24", 0) -def get_lighteval_tasks(): - return list(LIGHTEVAL_TASKS.keys()) +def get_lighteval_tasks(backend: str = "vllm"): + """Get lighteval tasks with specified backend configuration. + + Args: + backend (str, optional): Backend to use for evaluation. Either "vllm" or "sglang". Defaults to "vllm". 
+ + Returns: + List[str]: List of available task names + """ + tasks = LIGHTEVAL_TASKS.copy() + if backend == "sglang": + # Modify task configurations for SGLang backend + for task_name, task_config in tasks.items(): + # Add SGLang specific configuration while preserving the task definition + tasks[task_name] = f"custom|{task_config}|sglang" + return list(tasks.keys()) SUPPORTED_BENCHMARKS = get_lighteval_tasks() From 2210298fd28f6983e3861cc048acd0a74f94f20b Mon Sep 17 00:00:00 2001 From: Jhin <47354855+jhinpan@users.noreply.github.com> Date: Mon, 3 Feb 2025 13:06:03 -0600 Subject: [PATCH 2/4] Some Doc Style and Coding Style Fix --- README.md | 6 +++--- setup.py | 2 +- src/open_r1/generate.py | 2 +- src/open_r1/utils/evaluation.py | 7 +++++-- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index d71a6cea..e155a499 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ export LD_LIBRARY_PATH=$(python -c "import site; print(site.getsitepackages()[0] This will also install PyTorch `v2.5.1` and it is **very important** to use this version since the vLLM binaries are compiled for it. -Alternatively, you can use *SGLang* as a replacement for *vLLM*: +Alternatively, you can use *SGLang* as inference backend. ```shell pip install --upgrade pip @@ -207,11 +207,11 @@ To use Tensor Parallelism: make evaluate MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B TASK=aime24 PARALLEL=tensor NUM_GPUS=8 ``` -To use SGLang instead of vLLM, you would then run the commands with the --backend sglang flag or BACKEND=sglang for make commands. For example: +To use SGLang, you would then run the commands with the `--backend sglang` flag or `BACKEND=sglang` for make commands. For example: To evaluate on a single GPU: ```shell -make evaluate MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B TASK=aime24 BackEND=sglang +make evaluate MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B TASK=aime24 BACKEND=sglang ``` diff --git a/setup.py b/setup.py index 662fea28..b8552983 100644 --- a/setup.py +++ b/setup.py @@ -64,7 +64,7 @@ "transformers @ git+https://github.com/huggingface/transformers.git@main", "trl @ git+https://github.com/huggingface/trl.git@main", "vllm>=0.7.0", - "sglang>=0.4.0", + "sglang>=0.4.1", "wandb>=0.19.1", ] diff --git a/src/open_r1/generate.py b/src/open_r1/generate.py index 4e4c74f6..9af96868 100644 --- a/src/open_r1/generate.py +++ b/src/open_r1/generate.py @@ -57,7 +57,7 @@ def get_llm( def build_distilabel_pipeline( model: str, - backend: str = "vllm", # Add backend parameter + backend: str = "vllm", base_url: str = "http://localhost:8000/v1", prompt_column: Optional[str] = None, prompt_template: str = "{{ instruction }}", diff --git a/src/open_r1/utils/evaluation.py b/src/open_r1/utils/evaluation.py index 2fe53372..29e815af 100644 --- a/src/open_r1/utils/evaluation.py +++ b/src/open_r1/utils/evaluation.py @@ -52,10 +52,10 @@ def register_lighteval_task( def get_lighteval_tasks(backend: str = "vllm"): """Get lighteval tasks with specified backend configuration. - + Args: backend (str, optional): Backend to use for evaluation. Either "vllm" or "sglang". Defaults to "vllm". 
- + Returns: List[str]: List of available task names """ @@ -65,6 +65,9 @@ def get_lighteval_tasks(backend: str = "vllm"): for task_name, task_config in tasks.items(): # Add SGLang specific configuration while preserving the task definition tasks[task_name] = f"custom|{task_config}|sglang" + else: + # Keep original task configurations for vLLM backend + pass return list(tasks.keys()) From 5180fc6c6c989efd26c2aeeb843bdddd93b370f6 Mon Sep 17 00:00:00 2001 From: Jhin <47354855+jhinpan@users.noreply.github.com> Date: Mon, 3 Feb 2025 13:48:46 -0600 Subject: [PATCH 3/4] Add testing script for backend inference --- scripts/test_sampling.py | 98 ++++++++++++++++++++++++++++++++++ scripts/test_sglang_backend.py | 71 ++++++++++++++++++++++++ 2 files changed, 169 insertions(+) create mode 100644 scripts/test_sampling.py create mode 100644 scripts/test_sglang_backend.py diff --git a/scripts/test_sampling.py b/scripts/test_sampling.py new file mode 100644 index 00000000..66a0a92c --- /dev/null +++ b/scripts/test_sampling.py @@ -0,0 +1,98 @@ +from datasets import Dataset +from open_r1.generate import build_distilabel_pipeline + +# To run this test: +# First ensure both backends are running: +# For vLLM: +# python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf --port 8000 +# +# For SGLang (in a different terminal): +# sglang start-server --model meta-llama/Llama-2-7b-chat-hf --port 8001 +# +# Then run: python test_sampling.py +# +# This test script: +# - Tests four different sampling configurations: +# - High temperature (more random) +# - Low temperature (more deterministic) +# - Top-p sampling +# - Combined temperature and top-p +# - Generates multiple responses for each configuration to observe variation +# - Uses a single prompt that should produce creative, varied responses to make sampling differences more apparent + + +def test_sampling_behavior(backend: str): + # Single prompt to test sampling variations + test_data = { + "prompt": [ + "Write a creative color name and briefly describe it.", # This prompt should generate varied responses with sampling + ] + } + test_dataset = Dataset.from_dict(test_data) + + # Test different sampling configurations + sampling_configs = [ + { + "name": "High Temperature", + "params": {"temperature": 0.9, "top_p": None} + }, + { + "name": "Low Temperature", + "params": {"temperature": 0.1, "top_p": None} + }, + { + "name": "Top-P Sampling", + "params": {"temperature": None, "top_p": 0.9} + }, + { + "name": "Combined Sampling", + "params": {"temperature": 0.7, "top_p": 0.9} + }, + ] + + print(f"\nTesting {backend} backend sampling behavior:") + print("=" * 50) + + for config in sampling_configs: + print(f"\nTesting {config['name']}:") + print("-" * 30) + + # Create pipeline with specific sampling parameters + pipeline = build_distilabel_pipeline( + model="meta-llama/Llama-2-7b-chat-hf", # Replace with your model + backend=backend, + base_url="http://localhost:8000", + prompt_column="prompt", + temperature=config['params']['temperature'], + top_p=config['params']['top_p'], + max_new_tokens=50, # Keep short for testing + num_generations=3, # Generate multiple responses to see variation + input_batch_size=1, + timeout=30, + ) + + # Run inference + results = pipeline.run( + dataset=test_dataset, + dataset_batch_size=1, + use_cache=False, + ) + + # Print all generations for this configuration + for item in results: + print("\nGenerations:") + for i, gen in enumerate(item['generations'], 1): + print(f" {i}. 
{gen}") + + +def main(): + # Test both backends + for backend in ["vllm", "sglang"]: + try: + test_sampling_behavior(backend) + except Exception as e: + print(f"\nError testing {backend} backend: {str(e)}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/test_sglang_backend.py b/scripts/test_sglang_backend.py new file mode 100644 index 00000000..e514ba28 --- /dev/null +++ b/scripts/test_sglang_backend.py @@ -0,0 +1,71 @@ +from datasets import Dataset +from open_r1.generate import build_distilabel_pipeline + +# 1. First, make sure you have a SGLang server running: +# ```bash +# # Example of starting SGLang server (adjust according to your setup) +# sglang start-server --model meta-llama/Llama-2-7b-chat-hf --port 8000 +# ``` +# Then run the test script: +# ```bash +# python test_sglang_backend.py +# ``` + +# Some key points about the implementation: +# The test uses a tiny dataset with just 2 prompts to verify functionality quickly +# We're using small values for batch size and max_new_tokens to keep resource usage low +# The timeout is reduced to 30 seconds for faster testing +# Need to adjust the model name and server URL according to your setup +# To verify different aspects, we may: +# - Test error handling: +# - Test different generation parameters: +# - Test batch processing: + +# The main differences between vLLM and SGLang backends in your current implementation are: +# The API interface (OpenAI-compatible vs SGLang native) +# Parameter handling (generation_kwargs are passed differently) +# If you encounter any issues, the most likely points of failure would be: +# API compatibility +# Parameter mapping between backends +# Response format differences + + +def test_sglang_backend(): + # Create a tiny test dataset + test_data = { + "prompt": [ + "What is 2+2?", + "Write a one-sentence story.", + ] + } + test_dataset = Dataset.from_dict(test_data) + + # Initialize the pipeline with SGLang backend + pipeline = build_distilabel_pipeline( + model="meta-llama/Llama-2-7b-chat-hf", # Replace with your model + backend="sglang", + base_url="http://localhost:8000", # Adjust to your SGLang server URL + prompt_column="prompt", + temperature=0.7, + max_new_tokens=100, # Smaller for testing + num_generations=1, + input_batch_size=2, # Small batch size for testing + timeout=30, # Shorter timeout for testing + ) + + # Run inference + print("Running inference...") + results = pipeline.run( + dataset=test_dataset, + dataset_batch_size=2, + use_cache=False, + ) + + # Print results + print("\nGenerated responses:") + for i, item in enumerate(results): + print(f"\nPrompt {i+1}: {test_data['prompt'][i]}") + print(f"Response: {item['generations'][0]}") + +if __name__ == "__main__": + test_sglang_backend() \ No newline at end of file From 296418bc3bab6cf887e192ea5548dc1e9858fc68 Mon Sep 17 00:00:00 2001 From: Chayenne Date: Wed, 5 Feb 2025 14:24:03 -0800 Subject: [PATCH 4/4] Stash changes made in uclaml03 --- scripts/test_generate.py | 123 ++++++++++++++++++++++ scripts/test_sampling.py | 98 ------------------ scripts/test_sglang_backend.py | 71 ------------- src/open_r1/generate.py | 180 ++++++++++++++++++++------------- src/open_r1/pipeline.py | 43 ++++++++ src/open_r1/sft.py | 2 +- 6 files changed, 274 insertions(+), 243 deletions(-) create mode 100644 scripts/test_generate.py delete mode 100644 scripts/test_sampling.py delete mode 100644 scripts/test_sglang_backend.py mode change 100644 => 100755 src/open_r1/generate.py create mode 100644 src/open_r1/pipeline.py 
diff --git a/scripts/test_generate.py b/scripts/test_generate.py new file mode 100644 index 00000000..f2c632de --- /dev/null +++ b/scripts/test_generate.py @@ -0,0 +1,123 @@ +import pytest +import uuid +from datasets import Dataset +from open_r1.generate import build_distilabel_pipeline + +@pytest.fixture +def test_dataset(): + """Create a small test dataset.""" + return Dataset.from_dict({ + "prompt": [ + "What is 2+2?", + "Explain quantum computing in one sentence.", + "Write a haiku about programming." + ] + }) + +@pytest.mark.parametrize("backend", ["vllm", "sglang"]) +@pytest.mark.parametrize("batch_size", [1, 2]) +def test_basic_inference(test_dataset, backend, batch_size): + """Test basic inference capabilities with different batch sizes.""" + unique_id = str(uuid.uuid4())[:8] + + generation_kwargs = { + "temperature": 0.0, # Deterministic for testing + "max_new_tokens": 50, + "top_p": 1.0 + } + + pipeline = build_distilabel_pipeline( + model="meta-llama/Llama-2-7b-chat-hf", + backend=backend, + prompt_column="prompt", + generation_kwargs=generation_kwargs, + input_batch_size=batch_size, + unique_id=unique_id, + num_generations=1 + ) + + result = pipeline.run( + dataset=test_dataset, + dataset_batch_size=batch_size, + use_cache=False, + ) + + # Basic validation + assert len(result) == len(test_dataset) + assert "text" in result.features + assert all(isinstance(gen, str) and len(gen) > 0 for gen in result["text"]) + +@pytest.mark.parametrize("backend", ["vllm", "sglang"]) +def test_error_handling(test_dataset, backend): + """Test error handling with invalid configurations.""" + with pytest.raises(ValueError): + # Test with invalid batch size + pipeline = build_distilabel_pipeline( + model="meta-llama/Llama-2-7b-chat-hf", + backend=backend, + input_batch_size=0 + ) + +@pytest.mark.parametrize("backend", ["vllm", "sglang"]) +def test_multiple_generations(test_dataset, backend): + """Test multiple generations per prompt.""" + unique_id = str(uuid.uuid4())[:8] + + generation_kwargs = { + "temperature": 0.7, # Non-deterministic for multiple generations + "max_new_tokens": 50, + "top_p": 0.95 + } + + pipeline = build_distilabel_pipeline( + model="meta-llama/Llama-2-7b-chat-hf", + backend=backend, + prompt_column="prompt", + generation_kwargs=generation_kwargs, + input_batch_size=1, + unique_id=unique_id, + num_generations=2 # Generate 2 responses per prompt + ) + + result = pipeline.run( + dataset=test_dataset, + dataset_batch_size=1, + use_cache=False, + ) + + # Verify multiple generations + assert len(result) == len(test_dataset) * 2 + assert "text" in result.features + +@pytest.mark.parametrize("backend", ["vllm", "sglang"]) +def test_generation_parameters(test_dataset, backend): + """Test different generation parameters.""" + generation_configs = [ + {"temperature": 0.0, "max_new_tokens": 10, "top_p": 1.0}, + {"temperature": 0.8, "max_new_tokens": 100, "top_p": 0.9}, + ] + + for gen_kwargs in generation_configs: + pipeline = build_distilabel_pipeline( + model="meta-llama/Llama-2-7b-chat-hf", + backend=backend, + prompt_column="prompt", + generation_kwargs=gen_kwargs, + input_batch_size=1 + ) + + result = pipeline.run( + dataset=test_dataset, + dataset_batch_size=1, + use_cache=False, + ) + + assert len(result) == len(test_dataset) + if gen_kwargs["temperature"] == 0.0: + # For deterministic generation, outputs should be identical across runs + second_result = pipeline.run( + dataset=test_dataset, + dataset_batch_size=1, + use_cache=False, + ) + assert all(r1 == r2 for r1, r2 in 
zip(result["text"], second_result["text"])) \ No newline at end of file diff --git a/scripts/test_sampling.py b/scripts/test_sampling.py deleted file mode 100644 index 66a0a92c..00000000 --- a/scripts/test_sampling.py +++ /dev/null @@ -1,98 +0,0 @@ -from datasets import Dataset -from open_r1.generate import build_distilabel_pipeline - -# To run this test: -# First ensure both backends are running: -# For vLLM: -# python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf --port 8000 -# -# For SGLang (in a different terminal): -# sglang start-server --model meta-llama/Llama-2-7b-chat-hf --port 8001 -# -# Then run: python test_sampling.py -# -# This test script: -# - Tests four different sampling configurations: -# - High temperature (more random) -# - Low temperature (more deterministic) -# - Top-p sampling -# - Combined temperature and top-p -# - Generates multiple responses for each configuration to observe variation -# - Uses a single prompt that should produce creative, varied responses to make sampling differences more apparent - - -def test_sampling_behavior(backend: str): - # Single prompt to test sampling variations - test_data = { - "prompt": [ - "Write a creative color name and briefly describe it.", # This prompt should generate varied responses with sampling - ] - } - test_dataset = Dataset.from_dict(test_data) - - # Test different sampling configurations - sampling_configs = [ - { - "name": "High Temperature", - "params": {"temperature": 0.9, "top_p": None} - }, - { - "name": "Low Temperature", - "params": {"temperature": 0.1, "top_p": None} - }, - { - "name": "Top-P Sampling", - "params": {"temperature": None, "top_p": 0.9} - }, - { - "name": "Combined Sampling", - "params": {"temperature": 0.7, "top_p": 0.9} - }, - ] - - print(f"\nTesting {backend} backend sampling behavior:") - print("=" * 50) - - for config in sampling_configs: - print(f"\nTesting {config['name']}:") - print("-" * 30) - - # Create pipeline with specific sampling parameters - pipeline = build_distilabel_pipeline( - model="meta-llama/Llama-2-7b-chat-hf", # Replace with your model - backend=backend, - base_url="http://localhost:8000", - prompt_column="prompt", - temperature=config['params']['temperature'], - top_p=config['params']['top_p'], - max_new_tokens=50, # Keep short for testing - num_generations=3, # Generate multiple responses to see variation - input_batch_size=1, - timeout=30, - ) - - # Run inference - results = pipeline.run( - dataset=test_dataset, - dataset_batch_size=1, - use_cache=False, - ) - - # Print all generations for this configuration - for item in results: - print("\nGenerations:") - for i, gen in enumerate(item['generations'], 1): - print(f" {i}. {gen}") - - -def main(): - # Test both backends - for backend in ["vllm", "sglang"]: - try: - test_sampling_behavior(backend) - except Exception as e: - print(f"\nError testing {backend} backend: {str(e)}") - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/scripts/test_sglang_backend.py b/scripts/test_sglang_backend.py deleted file mode 100644 index e514ba28..00000000 --- a/scripts/test_sglang_backend.py +++ /dev/null @@ -1,71 +0,0 @@ -from datasets import Dataset -from open_r1.generate import build_distilabel_pipeline - -# 1. 
First, make sure you have a SGLang server running: -# ```bash -# # Example of starting SGLang server (adjust according to your setup) -# sglang start-server --model meta-llama/Llama-2-7b-chat-hf --port 8000 -# ``` -# Then run the test script: -# ```bash -# python test_sglang_backend.py -# ``` - -# Some key points about the implementation: -# The test uses a tiny dataset with just 2 prompts to verify functionality quickly -# We're using small values for batch size and max_new_tokens to keep resource usage low -# The timeout is reduced to 30 seconds for faster testing -# Need to adjust the model name and server URL according to your setup -# To verify different aspects, we may: -# - Test error handling: -# - Test different generation parameters: -# - Test batch processing: - -# The main differences between vLLM and SGLang backends in your current implementation are: -# The API interface (OpenAI-compatible vs SGLang native) -# Parameter handling (generation_kwargs are passed differently) -# If you encounter any issues, the most likely points of failure would be: -# API compatibility -# Parameter mapping between backends -# Response format differences - - -def test_sglang_backend(): - # Create a tiny test dataset - test_data = { - "prompt": [ - "What is 2+2?", - "Write a one-sentence story.", - ] - } - test_dataset = Dataset.from_dict(test_data) - - # Initialize the pipeline with SGLang backend - pipeline = build_distilabel_pipeline( - model="meta-llama/Llama-2-7b-chat-hf", # Replace with your model - backend="sglang", - base_url="http://localhost:8000", # Adjust to your SGLang server URL - prompt_column="prompt", - temperature=0.7, - max_new_tokens=100, # Smaller for testing - num_generations=1, - input_batch_size=2, # Small batch size for testing - timeout=30, # Shorter timeout for testing - ) - - # Run inference - print("Running inference...") - results = pipeline.run( - dataset=test_dataset, - dataset_batch_size=2, - use_cache=False, - ) - - # Print results - print("\nGenerated responses:") - for i, item in enumerate(results): - print(f"\nPrompt {i+1}: {test_data['prompt'][i]}") - print(f"Response: {item['generations'][0]}") - -if __name__ == "__main__": - test_sglang_backend() \ No newline at end of file diff --git a/src/open_r1/generate.py b/src/open_r1/generate.py old mode 100644 new mode 100755 index 9af96868..a64ea0bf --- a/src/open_r1/generate.py +++ b/src/open_r1/generate.py @@ -13,13 +13,12 @@ # limitations under the License. 
from typing import Optional - +from typing import Union from distilabel.llms import OpenAILLM from distilabel.pipeline import Pipeline from distilabel.steps import StepResources from distilabel.steps.tasks import TextGeneration -from typing import Union -from sglang import RuntimeClient +import sglang as sgl class LLMBackend: @@ -32,25 +31,47 @@ def get_llm( base_url: str = "http://localhost:8000/v1", timeout: int = 900, max_retries: int = 0, - generation_kwargs: dict = None - ) -> Union[OpenAILLM, RuntimeClient]: - if self.backend == "vllm": - return OpenAILLM( - base_url=base_url, - api_key="something", - model=model, - timeout=timeout, - max_retries=max_retries, - generation_kwargs=generation_kwargs, - ) - elif self.backend == "sglang": - return RuntimeClient( - model=model, - api_base=base_url, - timeout=timeout, - max_retries=max_retries, - **generation_kwargs + generation_kwargs: dict = None, + ) -> Union[OpenAILLM, sgl.Engine]: + """Get LLM instance based on backend.""" + if generation_kwargs is None: + generation_kwargs = {} + + if self.backend == "sglang": + # SGLang engine initialization with correct parameters + sglang_kwargs = { + "model_path": model, + "log_level": "error", + "device": "cuda", + "dtype": "float16", + "max_model_len": 4096, # Add reasonable default max length + } + + # Create engine with basic parameters + engine = sgl.Engine(**sglang_kwargs) + + # Set generation config after initialization + max_tokens = generation_kwargs.get("max_new_tokens", 512) + engine.set_generation_params( + max_tokens=max_tokens, + temperature=generation_kwargs.get("temperature", 0.7), + top_p=generation_kwargs.get("top_p", 0.95) ) + + return engine + + elif self.backend == "vllm": + try: + return OpenAILLM( + base_url=base_url, + api_key="dummy-key", # vLLM doesn't require real key + model=model, + timeout=timeout, + max_retries=max_retries, + generation_kwargs=generation_kwargs, + ) + except Exception as e: + raise ConnectionError(f"Failed to connect to vLLM server at {base_url}: {str(e)}") else: raise ValueError(f"Unknown backend: {self.backend}") @@ -61,26 +82,40 @@ def build_distilabel_pipeline( base_url: str = "http://localhost:8000/v1", prompt_column: Optional[str] = None, prompt_template: str = "{{ instruction }}", - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_new_tokens: int = 8192, + generation_kwargs: Optional[dict] = None, num_generations: int = 1, input_batch_size: int = 64, client_replicas: int = 1, timeout: int = 900, retries: int = 0, + unique_id: Optional[str] = None, ) -> Pipeline: - generation_kwargs = {"max_new_tokens": max_new_tokens} - - if temperature is not None: - generation_kwargs["temperature"] = temperature + """Build a distilabel pipeline for text generation. + + Args: + model (str): Model identifier or path + backend (str, optional): Backend to use ("vllm" or "sglang"). Defaults to "vllm". + base_url (str, optional): Base URL for vLLM server. Defaults to "http://localhost:8000/v1". + prompt_column (str, optional): Column name containing prompts. Defaults to None. + prompt_template (str, optional): Template for formatting prompts. Defaults to "{{ instruction }}". + generation_kwargs (dict, optional): Generation parameters. Defaults to None. + num_generations (int, optional): Number of generations per prompt. Defaults to 1. + input_batch_size (int, optional): Batch size for processing. Defaults to 64. + client_replicas (int, optional): Number of client replicas. Defaults to 1. + timeout (int, optional): Timeout in seconds. 
Defaults to 900. + retries (int, optional): Number of retries. Defaults to 0. + unique_id (str, optional): Unique identifier for the pipeline. Defaults to None. - if top_p is not None: - generation_kwargs["top_p"] = top_p + Returns: + Pipeline: Configured distilabel pipeline + """ + if generation_kwargs is None: + generation_kwargs = {} llm_backend = LLMBackend(backend=backend) - - with Pipeline().ray() as pipeline: + pipeline_name = f"pipeline_{unique_id}" if unique_id else "pipeline" + + with Pipeline(name=pipeline_name).ray() as pipeline: TextGeneration( llm=llm_backend.get_llm( model, @@ -102,10 +137,9 @@ def build_distilabel_pipeline( if __name__ == "__main__": import argparse - from datasets import load_dataset - parser = argparse.ArgumentParser(description="Run distilabel pipeline for generating responses with DeepSeek R1") + parser = argparse.ArgumentParser(description="Run distilabel pipeline for generating responses") parser.add_argument( "--hf-dataset", type=str, @@ -128,6 +162,7 @@ def build_distilabel_pipeline( "--prompt-column", type=str, default="prompt", + help="Column containing prompts", ) parser.add_argument( "--prompt-template", @@ -142,96 +177,95 @@ def build_distilabel_pipeline( help="Model name to use for generation", ) parser.add_argument( - "--vllm-server-url", + "--backend", + type=str, + default="vllm", + choices=["vllm", "sglang"], + help="Backend to use for generation", + ) + parser.add_argument( + "--base-url", type=str, default="http://localhost:8000/v1", - help="URL of the vLLM server", + help="Base URL for the backend server", ) parser.add_argument( "--temperature", type=float, + default=0.7, help="Temperature for generation", ) parser.add_argument( "--top-p", type=float, + default=0.95, help="Top-p value for generation", ) parser.add_argument( "--max-new-tokens", type=int, - default=8192, + default=512, help="Maximum number of new tokens to generate", ) - parser.add_argument( - "--num-generations", - type=int, - default=1, - help="Number of generations per problem", - ) parser.add_argument( "--input-batch-size", type=int, - default=64, - help="Batch size for input processing", + default=32, + help="Batch size for processing inputs", ) parser.add_argument( "--client-replicas", type=int, default=1, - help="Number of client replicas for parallel processing", + help="Number of client replicas", ) parser.add_argument( "--timeout", type=int, - default=600, - help="Request timeout in seconds (default: 600)", + default=900, + help="Timeout in seconds", ) parser.add_argument( "--retries", type=int, default=0, - help="Number of retries for failed requests (default: 0)", + help="Number of retries", ) parser.add_argument( "--hf-output-dataset", type=str, - required=False, - help="HuggingFace repo to push results to", + help="HuggingFace dataset name to push results to", ) parser.add_argument( "--private", action="store_true", - help="Whether to make the output dataset private when pushing to HF Hub", - ) - parser.add_argument( - "--backend", - type=str, - default="vllm", - choices=["vllm", "sglang"], - help="Backend to use for generation (vllm or sglang)", + help="Whether to make the output dataset private", ) args = parser.parse_args() - print("\nRunning with arguments:") - for arg, value in vars(args).items(): - print(f" {arg}: {value}") - print() + # Load dataset + dataset = load_dataset( + args.hf_dataset, + args.hf_dataset_config, + split=args.hf_dataset_split, + ) - print(f"Loading '{args.hf_dataset}' (config: {args.hf_dataset_config}, split: 
{args.hf_dataset_split}) dataset...") - dataset = load_dataset(args.hf_dataset, args.hf_dataset_config, split=args.hf_dataset_split) - print("Dataset loaded!") + # Prepare generation kwargs + generation_kwargs = { + "temperature": args.temperature, + "top_p": args.top_p, + "max_new_tokens": args.max_new_tokens, + } + # Build and run pipeline pipeline = build_distilabel_pipeline( model=args.model, - base_url=args.vllm_server_url, - prompt_template=args.prompt_template, + backend=args.backend, + base_url=args.base_url, prompt_column=args.prompt_column, - temperature=args.temperature, - top_p=args.top_p, - max_new_tokens=args.max_new_tokens, - num_generations=args.num_generations, + prompt_template=args.prompt_template, + generation_kwargs=generation_kwargs, input_batch_size=args.input_batch_size, client_replicas=args.client_replicas, timeout=args.timeout, @@ -249,4 +283,4 @@ def build_distilabel_pipeline( if args.hf_output_dataset: print(f"Pushing resulting dataset to '{args.hf_output_dataset}'...") distiset.push_to_hub(args.hf_output_dataset, private=args.private) - print("Dataset pushed!") + print("Dataset pushed!") \ No newline at end of file diff --git a/src/open_r1/pipeline.py b/src/open_r1/pipeline.py new file mode 100644 index 00000000..efd8dac6 --- /dev/null +++ b/src/open_r1/pipeline.py @@ -0,0 +1,43 @@ +from datasets import load_dataset +from distilabel.models import vLLM +from distilabel.pipeline import Pipeline +from distilabel.steps.tasks import TextGeneration + + +prompt_template = """\ +You will be given a problem. Please reason step by step, and put your final answer within \boxed{}: +{{ instruction }}""" + +dataset = load_dataset("AI-MO/NuminaMath-TIR", split="train").select(range(10)) + +model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" # Exchange with another smol distilled r1 + +with Pipeline( + name="distill-qwen-7b-r1", + description="A pipeline to generate data from a distilled r1 model", +) as pipeline: + + llm = vLLM( + model=model_id, + tokenizer=model_id, + extra_kwargs={ + "tensor_parallel_size": 1, + "max_model_len": 8192, + }, + generation_kwargs={ + "temperature": 0.6, + "max_new_tokens": 8192, + }, + ) + prompt_column = "problem" + text_generation = TextGeneration( + llm=llm, + template=prompt_template, + num_generations=4, + input_mappings={"instruction": prompt_column} if prompt_column is not None else {} + ) + + +if __name__ == "__main__": + distiset = pipeline.run(dataset=dataset) + distiset.push_to_hub(repo_id="JinnP/numina-deepseek-r1-qwen-7b") \ No newline at end of file diff --git a/src/open_r1/sft.py b/src/open_r1/sft.py index d5369fc2..1a88ba6a 100644 --- a/src/open_r1/sft.py +++ b/src/open_r1/sft.py @@ -167,7 +167,7 @@ def main(script_args, training_args, model_args): # Save everything else on main process kwargs = { - "finetuned_from": model_args.model_name_or_path, + # "finetuned_from": model_args.model_name_or_path, # This argument is not supported "dataset": list(script_args.dataset_name), "dataset_tags": list(script_args.dataset_name), "tags": ["open-r1"],
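(Editorial aside, not part of the patch series: taken together, these commits surface the backend choice through the `--backend` flag added to `src/open_r1/generate.py`. A plausible invocation of the updated script with the SGLang backend is sketched below; every flag maps to an argparse option defined in the final revision of `generate.py`, and the dataset, model, prompt column, and output repo names are the same examples already used elsewhere in this repo. It assumes the SGLang dependencies are installed and a GPU is available.)

```shell
python src/open_r1/generate.py \
    --hf-dataset AI-MO/NuminaMath-TIR \
    --model deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \
    --backend sglang \
    --prompt-column problem \
    --temperature 0.6 \
    --max-new-tokens 512 \
    --hf-output-dataset username/r1-dataset
```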