Skip to content

Commit b1f70c0

Browse files
authored
Removing handling max bs from client, handling in the REST API (#347)
* Remove handling max bs from client, handling in the REST API
* Remove init with zeros for batch_size limits
* Remove warning
1 parent 2e34944 commit b1f70c0

File tree

6 files changed

+21
-48
lines changed

6 files changed

+21
-48
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api"
1212

1313
[tool.poetry]
1414
name = "together"
15-
version = "1.5.21"
15+
version = "1.5.22"
1616
authors = ["Together AI <support@together.ai>"]
1717
description = "Python client for Together's Cloud Platform!"
1818
readme = "README.md"

src/together/cli/api/finetune.py

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -304,13 +304,8 @@ def create(
304304
raise click.BadParameter(
305305
f"LoRA fine-tuning is not supported for the model `{model}`"
306306
)
307-
if training_method == "dpo":
308-
default_batch_size = model_limits.lora_training.max_batch_size_dpo
309-
else:
310-
default_batch_size = model_limits.lora_training.max_batch_size
311307
default_values = {
312308
"lora_r": model_limits.lora_training.max_rank,
313-
"batch_size": default_batch_size,
314309
"learning_rate": 1e-3,
315310
}
316311

@@ -335,15 +330,6 @@ def create(
335330
f"Please change the job type with --lora or remove `{param}` from the arguments"
336331
)
337332

338-
batch_size_source = ctx.get_parameter_source("batch_size") # type: ignore[attr-defined]
339-
if batch_size_source == ParameterSource.DEFAULT:
340-
if training_method == "dpo":
341-
training_args["batch_size"] = (
342-
model_limits.full_training.max_batch_size_dpo
343-
)
344-
else:
345-
training_args["batch_size"] = model_limits.full_training.max_batch_size
346-
347333
if n_evals <= 0 and validation_file:
348334
log_warn(
349335
"Warning: You have specified a validation file but the number of evaluation loops is set to 0. No evaluations will be performed."

src/together/legacy/finetune.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def create(
1616
model: str,
1717
n_epochs: int = 1,
1818
n_checkpoints: int | None = 1,
19-
batch_size: int | None = 32,
19+
batch_size: int | Literal["max"] = "max",
2020
learning_rate: float = 0.00001,
2121
suffix: (
2222
str | None
@@ -43,7 +43,7 @@ def create(
4343
model=model,
4444
n_epochs=n_epochs,
4545
n_checkpoints=n_checkpoints,
46-
batch_size=batch_size if isinstance(batch_size, int) else "max",
46+
batch_size=batch_size,
4747
learning_rate=learning_rate,
4848
suffix=suffix,
4949
wandb_api_key=wandb_api_key,

src/together/resources/finetune.py

Lines changed: 13 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -89,18 +89,10 @@ def create_finetune_request(
8989

9090
model_or_checkpoint = model or from_checkpoint
9191

92-
if batch_size == "max":
93-
log_warn_once(
94-
"Starting from together>=1.3.0, "
95-
"the default batch size is set to the maximum allowed value for each model."
96-
)
9792
if warmup_ratio is None:
9893
warmup_ratio = 0.0
9994

10095
training_type: TrainingType = FullTrainingType()
101-
max_batch_size: int = 0
102-
max_batch_size_dpo: int = 0
103-
min_batch_size: int = 0
10496
if lora:
10597
if model_limits.lora_training is None:
10698
raise ValueError(
@@ -133,28 +125,23 @@ def create_finetune_request(
133125
min_batch_size = model_limits.full_training.min_batch_size
134126
max_batch_size_dpo = model_limits.full_training.max_batch_size_dpo
135127

136-
if batch_size == "max":
137-
if training_method == "dpo":
138-
batch_size = max_batch_size_dpo
139-
else:
140-
batch_size = max_batch_size
128+
if batch_size != "max":
129+
if training_method == "sft":
130+
if batch_size > max_batch_size:
131+
raise ValueError(
132+
f"Requested batch size of {batch_size} is higher that the maximum allowed value of {max_batch_size}."
133+
)
134+
elif training_method == "dpo":
135+
if batch_size > max_batch_size_dpo:
136+
raise ValueError(
137+
f"Requested batch size of {batch_size} is higher that the maximum allowed value of {max_batch_size_dpo}."
138+
)
141139

142-
if training_method == "sft":
143-
if batch_size > max_batch_size:
144-
raise ValueError(
145-
f"Requested batch size of {batch_size} is higher that the maximum allowed value of {max_batch_size}."
146-
)
147-
elif training_method == "dpo":
148-
if batch_size > max_batch_size_dpo:
140+
if batch_size < min_batch_size:
149141
raise ValueError(
150-
f"Requested batch size of {batch_size} is higher that the maximum allowed value of {max_batch_size_dpo}."
142+
f"Requested batch size of {batch_size} is lower that the minimum allowed value of {min_batch_size}."
151143
)
152144

153-
if batch_size < min_batch_size:
154-
raise ValueError(
155-
f"Requested batch size of {batch_size} is lower that the minimum allowed value of {min_batch_size}."
156-
)
157-
158145
if warmup_ratio > 1 or warmup_ratio < 0:
159146
raise ValueError(f"Warmup ratio should be between 0 and 1 (got {warmup_ratio})")
160147

src/together/types/finetune.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ class FinetuneRequest(BaseModel):
195195
# number of evaluation loops to run
196196
n_evals: int | None = None
197197
# training batch size
198-
batch_size: int | None = None
198+
batch_size: int | Literal["max"] | None = None
199199
# up to 40 character suffix for output model name
200200
suffix: str | None = None
201201
# weights & biases api key

tests/unit/test_finetune_resources.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def test_simple_request():
4444
assert request.n_epochs > 0
4545
assert request.warmup_ratio == 0.0
4646
assert request.training_type.type == "Full"
47-
assert request.batch_size == _MODEL_LIMITS.full_training.max_batch_size
47+
assert request.batch_size == "max"
4848

4949

5050
def test_validation_file():
@@ -82,7 +82,7 @@ def test_lora_request():
8282
assert request.training_type.lora_alpha == _MODEL_LIMITS.lora_training.max_rank * 2
8383
assert request.training_type.lora_dropout == 0.0
8484
assert request.training_type.lora_trainable_modules == "all-linear"
85-
assert request.batch_size == _MODEL_LIMITS.lora_training.max_batch_size
85+
assert request.batch_size == "max"
8686

8787

8888
@pytest.mark.parametrize("lora_dropout", [-1, 0, 0.5, 1.0, 10.0])
@@ -124,7 +124,7 @@ def test_dpo_request_lora():
124124
assert request.training_type.lora_alpha == _MODEL_LIMITS.lora_training.max_rank * 2
125125
assert request.training_type.lora_dropout == 0.0
126126
assert request.training_type.lora_trainable_modules == "all-linear"
127-
assert request.batch_size == _MODEL_LIMITS.lora_training.max_batch_size_dpo
127+
assert request.batch_size == "max"
128128

129129

130130
def test_dpo_request():
@@ -137,7 +137,7 @@ def test_dpo_request():
137137
)
138138

139139
assert request.training_type.type == "Full"
140-
assert request.batch_size == _MODEL_LIMITS.full_training.max_batch_size_dpo
140+
assert request.batch_size == "max"
141141

142142

143143
def test_from_checkpoint_request():

0 commit comments

Comments (0)