diff --git a/xopt/numerical_optimizer.py b/xopt/numerical_optimizer.py
index 1ad116a69..3bd1fb9a7 100644
--- a/xopt/numerical_optimizer.py
+++ b/xopt/numerical_optimizer.py
@@ -1,10 +1,11 @@
+import warnings
 from abc import ABC, abstractmethod
 from typing import Optional
 
 import torch
 from botorch.acquisition import AcquisitionFunction
 from botorch.optim import optimize_acqf
-from pydantic import ConfigDict, Field, PositiveFloat, PositiveInt
+from pydantic import ConfigDict, Field, PositiveFloat, PositiveInt, field_validator
 from torch import Tensor
 
 from xopt.pydantic import XoptBaseModel
@@ -14,13 +15,6 @@ class NumericalOptimizer(XoptBaseModel, ABC):
     """
     Base class for numerical optimizers.
 
-    Attributes
-    ----------
-    name : str
-        The name of the optimizer. Default is "base_numerical_optimizer".
-    model_config : ConfigDict
-        Configuration dictionary with extra fields forbidden.
-
     Methods
     -------
     optimize(function, bounds, n_candidates=1, **kwargs)
@@ -42,37 +36,38 @@ class LBFGSOptimizer(NumericalOptimizer):
     Attributes
     ----------
     n_restarts : PositiveInt
-        Number of restarts during acquisition function optimization, default is 20.
+        Number of restarts (independent initial conditions) for optimization, default is 20.
+    n_raw_samples : PositiveInt
+        Number of raw samples used to pick the initial `n_restarts` points, default is 128.
     max_iter : PositiveInt
         Maximum number of iterations for the optimizer, default is 2000.
     max_time : Optional[PositiveFloat]
         Maximum time allowed for optimization, default is None (no time limit).
+    max_ls : Optional[PositiveInt]
+        Maximum number of line search steps, default is None (use scipy defaults).
+    with_grad : bool
+        Whether to use autograd (True, default) or finite difference (False) for gradient computation.
+    ftol : Optional[PositiveFloat]
+        Convergence tolerance on the acquisition function value. If None, use scipy defaults.
+    pgtol : Optional[PositiveFloat]
+        Convergence tolerance on the projected gradient. If None, use scipy defaults.
+    sequential : bool
+        Use sequential (True) or joint (False, default) optimization when multiple candidates are requested.
 
     Methods
     -------
     optimize(function, bounds, n_candidates=1, **kwargs)
         Optimize the given acquisition function within the specified bounds.
-
-    Parameters
-    ----------
-    function : callable
-        The acquisition function to be optimized.
-    bounds : Tensor
-        The bounds within which to optimize the acquisition function. Must have shape [2, ndim].
-    n_candidates : int, optional
-        Number of candidates to return, default is 1.
-    **kwargs : dict
-        Additional keyword arguments to pass to the optimizer.
-
-    Returns
-    -------
-    candidates : Tensor
-        The optimized candidates.
     """
 
     name: str = Field("LBFGS", frozen=True)
     n_restarts: PositiveInt = Field(
-        20, description="number of restarts during acquisition function optimization"
+        20,
+        description="number of restarts (independent initial conditions) for optimization",
+    )
+    n_raw_samples: Optional[PositiveInt] = Field(
+        128,
+        description="number of raw samples - `n_restarts` best ones are selected from this set",
     )
     max_iter: PositiveInt = Field(
         2000, description="maximum number of optimization steps"
@@ -80,9 +75,50 @@ class LBFGSOptimizer(NumericalOptimizer):
     max_time: Optional[PositiveFloat] = Field(
         5.0, description="maximum time for optimization in seconds"
     )
-
+    max_ls: Optional[PositiveInt] = Field(
+        None,
+        description="maximum number of line search steps. If None, use scipy defaults.",
+    )
+    with_grad: bool = Field(
+        True,
+        description="Use autograd (true) or finite difference (false) for gradient computation."
+        " Use autograd unless it is impossible (e.g. non-differentiable elements).",
+    )
+    eps: Optional[PositiveFloat] = Field(
+        None,
+        description="Step size used for finite difference if with_grad is False."
+        " If None, use scipy default of 1e-08.",
+    )
+    ftol: Optional[PositiveFloat] = Field(
+        None,
+        description="Convergence tolerance on the acquisition function value."
+        " If None, use scipy default of ftol = 1e7 * numpy.finfo(float).eps = 2.22e-09.",
+    )
+    pgtol: Optional[PositiveFloat] = Field(
+        None,
+        description="Convergence tolerance on the projected gradient."
+        " If None, use scipy default of 1e-05.",
+    )
+    sequential: bool = Field(
+        False,
+        description="Use sequential (true) or joint (false) optimization when q > 1."
+        " In practice, joint optimization is faster but requires more GPU memory,"
+        " and sequential optimization is slightly more robust (especially with high q).",
+    )
     model_config = ConfigDict(validate_assignment=True)
 
+    @field_validator("n_raw_samples", mode="after")
+    def validate_n_raw_samples(cls, v, info):
+        if v is None:
+            return 128
+        n_restarts = info.data.get("n_restarts", 20)
+        if v < n_restarts:
+            warnings.warn(
+                f"n_raw_samples should be >= n_restarts, setting to n_restarts={n_restarts}"
+            )
+            v = n_restarts
+        return v
+
     def optimize(self, function, bounds, n_candidates=1, **kwargs):
         """
         Optimize the given acquisition function within the specified bounds.
@@ -108,21 +144,35 @@ def optimize(self, function, bounds, n_candidates=1, **kwargs):
         if len(bounds) != 2:
             raise ValueError("bounds must have the shape [2, ndim]")
 
-        # emperical testing showed that the max time is overrun slightly on the botorch side
+        # empirical testing showed that the max time is overrun slightly on the botorch side
         # fix by slightly reducing the max time passed to this function
         if self.max_time is not None:
             max_time = self.max_time * 0.8 - 0.01
         else:
             max_time = None
 
+        options = {
+            "maxiter": self.max_iter,
+            "with_grad": self.with_grad,
+        }
+        if self.ftol is not None:
+            options["ftol"] = self.ftol
+        if self.pgtol is not None:
+            options["pgtol"] = self.pgtol
+        if self.max_ls is not None:
+            options["max_ls"] = self.max_ls
+        if self.eps is not None:
+            options["eps"] = self.eps
+
         candidates, _ = optimize_acqf(
             acq_function=function,
             bounds=bounds,
             q=n_candidates,
-            raw_samples=self.n_restarts,
+            raw_samples=self.n_raw_samples,
             num_restarts=self.n_restarts,
             timeout_sec=max_time,
-            options={"maxiter": self.max_iter},
+            options=options,
+            sequential=self.sequential,
             **kwargs,
         )
         return candidates
diff --git a/xopt/tests/test_numerical_optimizer.py b/xopt/tests/test_numerical_optimizer.py
index 9166fda55..9973b419f 100644
--- a/xopt/tests/test_numerical_optimizer.py
+++ b/xopt/tests/test_numerical_optimizer.py
@@ -18,7 +18,8 @@ def f(X):
     # q_reduction converts it into `sample_shape x batch_shape`
     # sample_reduction converts it into `batch_shape`-dim Tensor of acquisition values
     # so, final fake tensor needs to have ndim=1
-    result = torch.amax(X, dim=(1, 2))
+    assert X.ndim == 3, "X should be a 3D tensor with shape [batch_shape, q, d]"
+    result = torch.amax(X, dim=(1, 2)) * 2
     return result
 
 
@@ -29,13 +30,28 @@ def test_base(self):
 
     def test_lbfgs_optimizer(self):
         optimizer = LBFGSOptimizer()
+        optimizer2 = LBFGSOptimizer(with_grad=False, eps=1e-10)
         for ndim in [1, 3]:
             bounds = torch.stack((torch.zeros(ndim), torch.ones(ndim)))
             for ncandidate in [1, 3]:
+                # np.random.seed(42)
+                # torch.manual_seed(42)
                 candidates = optimizer.optimize(
                     function=f, bounds=bounds, n_candidates=ncandidate
                 )
+                # np.random.seed(42)
+                # torch.manual_seed(42)
+                candidates2 = optimizer2.optimize(
+                    function=f, bounds=bounds, n_candidates=ncandidate
+                )
                 assert candidates.shape == torch.Size([ncandidate, ndim])
+                assert candidates2.shape == torch.Size([ncandidate, ndim])
+                f1 = f(candidates.unsqueeze(0))
+                f2 = f(candidates2.unsqueeze(0))
+                if ncandidate == 1:
+                    # for multiple candidates the optimizers don't converge to the same point
+                    assert torch.allclose(candidates, candidates2, rtol=0.0, atol=0.05)
+                    assert torch.allclose(f1, f2, rtol=0.0, atol=0.05)
 
         # test max time
         max_time_optimizer = LBFGSOptimizer(max_time=1.0)
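Not part of the patch: a minimal usage sketch of the new LBFGSOptimizer options, assuming a toy callable (`toy_acq`, mirroring the test helper `f`) stands in for a real BoTorch acquisition function; the field values are illustrative, not defaults.

import torch

from xopt.numerical_optimizer import LBFGSOptimizer


def toy_acq(X):
    # stand-in for an acquisition function; X has shape [batch_shape, q, d]
    return torch.amax(X, dim=(1, 2))


# finite-difference gradients with explicit scipy L-BFGS-B tolerances (illustrative values)
optimizer = LBFGSOptimizer(
    n_restarts=10,
    n_raw_samples=64,
    with_grad=False,
    eps=1e-10,
    ftol=1e-9,
    pgtol=1e-6,
    max_ls=50,
)
bounds = torch.stack((torch.zeros(3), torch.ones(3)))  # shape [2, ndim]
candidates = optimizer.optimize(function=toy_acq, bounds=bounds, n_candidates=2)
assert candidates.shape == torch.Size([2, 3])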