diff --git a/src/ModelPredictiveControl.jl b/src/ModelPredictiveControl.jl index adefdfef..70c0e360 100644 --- a/src/ModelPredictiveControl.jl +++ b/src/ModelPredictiveControl.jl @@ -8,8 +8,10 @@ using RecipesBase using ProgressLogging using DifferentiationInterface: ADTypes.AbstractADType, AutoForwardDiff, AutoSparse -using DifferentiationInterface: gradient!, jacobian!, prepare_gradient, prepare_jacobian +using DifferentiationInterface: prepare_gradient, prepare_jacobian, prepare_hessian +using DifferentiationInterface: gradient!, jacobian!, hessian! using DifferentiationInterface: value_and_gradient!, value_and_jacobian! +using DifferentiationInterface: value_gradient_and_hessian! using DifferentiationInterface: Constant, Cache using SparseConnectivityTracer: TracerSparsityDetector using SparseMatrixColorings: GreedyColoringAlgorithm, sparsity_pattern diff --git a/src/controller/execute.jl b/src/controller/execute.jl index a3925111..a72f7d74 100644 --- a/src/controller/execute.jl +++ b/src/controller/execute.jl @@ -405,7 +405,7 @@ function optim_objective!(mpc::PredictiveController{NT}) where {NT<:Real} MOIU.reset_optimizer(optim) JuMP.optimize!(optim) else - rethrow(err) + rethrow() end end if !issolved(optim) @@ -488,7 +488,7 @@ Call `periodsleep(mpc.estim.model)`. periodsleep(mpc::PredictiveController, busywait=false) = periodsleep(mpc.estim.model, busywait) """ - setstate!(mpc::PredictiveController, x̂, P̂=nothing) -> mpc + setstate!(mpc::PredictiveController, x̂[, P̂]) -> mpc Call [`setstate!`](@ref) on `mpc.estim` [`StateEstimator`](@ref). """ diff --git a/src/controller/nonlinmpc.jl b/src/controller/nonlinmpc.jl index 0fc28102..fb3d3619 100644 --- a/src/controller/nonlinmpc.jl +++ b/src/controller/nonlinmpc.jl @@ -13,8 +13,9 @@ struct NonLinMPC{ SE<:StateEstimator, TM<:TranscriptionMethod, JM<:JuMP.GenericModel, - GB<:AbstractADType, - JB<:AbstractADType, + GB<:Union{Nothing, AbstractADType}, + HB<:Union{Nothing, AbstractADType}, + JB<:AbstractADType, PT<:Any, JEfunc<:Function, GCfunc<:Function @@ -26,6 +27,7 @@ struct NonLinMPC{ optim::JM con::ControllerConstraint{NT, GCfunc} gradient::GB + hessian ::HB jacobian::JB Z̃::Vector{NT} ŷ::Vector{NT} @@ -63,13 +65,14 @@ struct NonLinMPC{ function NonLinMPC{NT}( estim::SE, Hp, Hc, M_Hp, N_Hc, L_Hp, Cwt, Ewt, JE::JEfunc, gc!::GCfunc, nc, p::PT, - transcription::TM, optim::JM, gradient::GB, jacobian::JB + transcription::TM, optim::JM, gradient::GB, hessian::HB, jacobian::JB, ) where { NT<:Real, SE<:StateEstimator, TM<:TranscriptionMethod, JM<:JuMP.GenericModel, - GB<:AbstractADType, + GB<:Union{Nothing, AbstractADType}, + HB<:Union{Nothing, AbstractADType}, JB<:AbstractADType, PT<:Any, JEfunc<:Function, @@ -107,9 +110,9 @@ struct NonLinMPC{ nZ̃ = get_nZ(estim, transcription, Hp, Hc) + nϵ Z̃ = zeros(NT, nZ̃) buffer = PredictiveControllerBuffer(estim, transcription, Hp, Hc, nϵ) - mpc = new{NT, SE, TM, JM, GB, JB, PT, JEfunc, GCfunc}( + mpc = new{NT, SE, TM, JM, GB, HB, JB, PT, JEfunc, GCfunc}( estim, transcription, optim, con, - gradient, jacobian, + gradient, hessian, jacobian, Z̃, ŷ, Hp, Hc, nϵ, weights, @@ -202,10 +205,14 @@ This controller allocates memory at each time step for the optimization. - `transcription=SingleShooting()` : a [`TranscriptionMethod`](@ref) for the optimization. - `optim=JuMP.Model(Ipopt.Optimizer)` : nonlinear optimizer used in the predictive controller, provided as a [`JuMP.Model`](@extref) object (default to [`Ipopt`](https://github.com/jump-dev/Ipopt.jl) optimizer). 
-- `gradient=AutoForwardDiff()` : an `AbstractADType` backend for the gradient of the objective - function, see [`DifferentiationInterface` doc](@extref DifferentiationInterface List). +- `hessian=nothing` : an `AbstractADType` backend for the Hessian of the objective function + (see [`DifferentiationInterface` doc](@extref DifferentiationInterface List)), or + `nothing` for the LBFGS approximation provided by `optim` (details in Extended Help). +- `gradient=isnothing(hessian) ? AutoForwardDiff() : nothing` : an `AbstractADType` backend + for the gradient of the objective function (see `hessian` for the options), or `nothing` + to retrieve the lower-order derivatives from the `hessian` backend. - `jacobian=default_jacobian(transcription)` : an `AbstractADType` backend for the Jacobian - of the nonlinear constraints, see `gradient` above for the options (default in Extended Help). + of the nonlinear constraints (see `hessian` for the options, defaults in Extended Help). - additional keyword arguments are passed to [`UnscentedKalmanFilter`](@ref) constructor (or [`SteadyKalmanFilter`](@ref), for [`LinModel`](@ref)). @@ -259,14 +266,17 @@ NonLinMPC controller with a sample time Ts = 10.0 s, Ipopt optimizer, UnscentedK exception: if `transcription` is not a [`SingleShooting`](@ref), the `jacobian` argument defaults to this [sparse backend](@extref DifferentiationInterface AutoSparse-object): ```julia - AutoSparse( + sparseAD = AutoSparse( AutoForwardDiff(); sparsity_detector = TracerSparsityDetector(), coloring_algorithm = GreedyColoringAlgorithm() ) ``` - Optimizers generally benefit from exact derivatives like AD. However, the [`NonLinModel`](@ref) - state-space functions must be compatible with this feature. See [`JuMP` documentation](@extref JuMP Common-mistakes-when-writing-a-user-defined-operator) + Also, the `hessian` argument defaults to `nothing`, meaning the LBFGS approximation of + `optim`. To provide an exact Hessian instead, a sparse backend like the one above is + recommended. In general, optimizers benefit from exact derivatives like AD. + However, the [`NonLinModel`](@ref) state-space functions must be compatible with this + feature. See [`JuMP` documentation](@extref JuMP Common-mistakes-when-writing-a-user-defined-operator) for common mistakes when writing these functions. Note that if `Cwt≠Inf`, the attribute `nlp_scaling_max_gradient` of `Ipopt` is set to @@ -291,7 +301,8 @@ function NonLinMPC( p = model.p, transcription::TranscriptionMethod = DEFAULT_NONLINMPC_TRANSCRIPTION, optim::JuMP.GenericModel = JuMP.Model(DEFAULT_NONLINMPC_OPTIMIZER, add_bridges=false), - gradient::AbstractADType = DEFAULT_NONLINMPC_GRADIENT, + hessian ::Union{Nothing, AbstractADType} = nothing, + gradient::Union{Nothing, AbstractADType} = isnothing(hessian) ? DEFAULT_NONLINMPC_GRADIENT : nothing, jacobian::AbstractADType = default_jacobian(transcription), kwargs... ) @@ -299,7 +310,7 @@ function NonLinMPC( return NonLinMPC( estim; Hp, Hc, Mwt, Nwt, Lwt, Cwt, Ewt, JE, gc, nc, p, M_Hp, N_Hc, L_Hp, - transcription, optim, gradient, jacobian + transcription, optim, gradient, hessian, jacobian ) end @@ -322,7 +333,8 @@ function NonLinMPC( p = model.p, transcription::TranscriptionMethod = DEFAULT_NONLINMPC_TRANSCRIPTION, optim::JuMP.GenericModel = JuMP.Model(DEFAULT_NONLINMPC_OPTIMIZER, add_bridges=false), - gradient::AbstractADType = DEFAULT_NONLINMPC_GRADIENT, + hessian ::Union{Nothing, AbstractADType} = nothing, + gradient::Union{Nothing, AbstractADType} = isnothing(hessian) ?
DEFAULT_NONLINMPC_GRADIENT : nothing, jacobian::AbstractADType = default_jacobian(transcription), kwargs... ) @@ -330,7 +342,7 @@ function NonLinMPC( return NonLinMPC( estim; Hp, Hc, Mwt, Nwt, Lwt, Cwt, Ewt, JE, gc, nc, p, M_Hp, N_Hc, L_Hp, - transcription, optim, gradient, jacobian + transcription, optim, gradient, hessian, jacobian ) end @@ -377,7 +389,8 @@ function NonLinMPC( p = estim.model.p, transcription::TranscriptionMethod = DEFAULT_NONLINMPC_TRANSCRIPTION, optim::JuMP.GenericModel = JuMP.Model(DEFAULT_NONLINMPC_OPTIMIZER, add_bridges=false), - gradient::AbstractADType = DEFAULT_NONLINMPC_GRADIENT, + hessian ::Union{Nothing, AbstractADType} = nothing, + gradient::Union{Nothing, AbstractADType} = isnothing(hessian) ? DEFAULT_NONLINMPC_GRADIENT : nothing, jacobian::AbstractADType = default_jacobian(transcription), ) where { NT<:Real, @@ -392,7 +405,7 @@ function NonLinMPC( gc! = get_mutating_gc(NT, gc) return NonLinMPC{NT}( estim, Hp, Hc, M_Hp, N_Hc, L_Hp, Cwt, Ewt, JE, gc!, nc, p, - transcription, optim, gradient, jacobian + transcription, optim, gradient, hessian, jacobian ) end @@ -540,12 +553,11 @@ function init_optimization!(mpc::NonLinMPC, model::SimModel, optim::JuMP.Generic JuMP.set_attribute(optim, "nlp_scaling_max_gradient", 10.0/C) end end - Jfunc, ∇Jfunc!, gfuncs, ∇gfuncs!, geqfuncs, ∇geqfuncs! = get_optim_functions( - mpc, optim - ) - @operator(optim, J, nZ̃, Jfunc, ∇Jfunc!) + validate_backends(mpc.gradient, mpc.hessian) + J_args, g_vec_args, geq_vec_args = get_optim_functions(mpc, optim) + @operator(optim, J, nZ̃, J_args...) @objective(optim, Min, J(Z̃var...)) - init_nonlincon!(mpc, model, transcription, gfuncs, ∇gfuncs!, geqfuncs, ∇geqfuncs!) + init_nonlincon!(mpc, model, transcription, g_vec_args, geq_vec_args) set_nonlincon!(mpc, model, transcription, optim) return nothing end @@ -553,14 +566,15 @@ end """ get_optim_functions( mpc::NonLinMPC, optim::JuMP.GenericModel - ) -> Jfunc, ∇Jfunc!, gfuncs, ∇gfuncs!, geqfuncs, ∇geqfuncs! + ) -> J_args, g_vec_args, geq_vec_args Return the functions for the nonlinear optimization of `mpc` [`NonLinMPC`](@ref) controller. - -Return the nonlinear objective `Jfunc` function, and `∇Jfunc!`, to compute its gradient. -Also return vectors with the nonlinear inequality constraint functions `gfuncs`, and -`∇gfuncs!`, for the associated gradients. Lastly, also return vectors with the nonlinear -equality constraint functions `geqfuncs` and gradients `∇geqfuncs!`. + +Return the tuple `J_args` containing the functions to compute the objective function +value and its derivatives. Also return the tuple `g_vec_args` containing two vectors of +functions to compute the nonlinear inequality constraint values and their gradients. +Lastly, return the tuple `geq_vec_args` containing two vectors of functions to compute +the nonlinear equality constraint values and their gradients. This method is really intricate and I'm not proud of it.
That's because of 3 elements: @@ -579,7 +593,7 @@ Inspired from: [User-defined operators with vector outputs](@extref JuMP User-de function get_optim_functions(mpc::NonLinMPC, ::JuMP.GenericModel{JNT}) where JNT<:Real # ----------- common cache for Jfunc, gfuncs and geqfuncs ---------------------------- model = mpc.estim.model - grad, jac = mpc.gradient, mpc.jacobian + grad, hess, jac = mpc.gradient, mpc.hessian, mpc.jacobian nu, ny, nx̂, nϵ, nk = model.nu, model.ny, mpc.estim.nx̂, mpc.nϵ, model.nk Hp, Hc = mpc.Hp, mpc.Hc ng, nc, neq = length(mpc.con.i_g), mpc.con.nc, mpc.con.neq @@ -601,61 +615,85 @@ function get_optim_functions(mpc::NonLinMPC, ::JuMP.GenericModel{JNT}) where JNT update_predictions!(ΔŨ, x̂0end, Ue, Ŷe, U0, Ŷ0, Û0, K0, X̂0, gc, g, geq, mpc, Z̃) return obj_nonlinprog!(Ŷ0, U0, mpc, model, Ue, Ŷe, ΔŨ) end - Z̃_∇J = fill(myNaN, nZ̃) # NaN to force update_predictions! at first call - ∇J_context = ( + Z̃_J = fill(myNaN, nZ̃) # NaN to force update_predictions! at first call + context_J = ( Cache(ΔŨ), Cache(x̂0end), Cache(Ue), Cache(Ŷe), Cache(U0), Cache(Ŷ0), Cache(Û0), Cache(K0), Cache(X̂0), Cache(gc), Cache(g), Cache(geq), ) - ∇J_prep = prepare_gradient(Jfunc!, grad, Z̃_∇J, ∇J_context...; strict) - ∇J = Vector{JNT}(undef, nZ̃) - function update_objective!(J, ∇J, Z̃, Z̃arg) - if isdifferent(Z̃arg, Z̃) - Z̃ .= Z̃arg - J[], _ = value_and_gradient!(Jfunc!, ∇J, ∇J_prep, grad, Z̃_∇J, ∇J_context...) - end - end + if !isnothing(grad) + prep_∇J = prepare_gradient(Jfunc!, grad, Z̃_J, context_J...; strict) + else + prep_∇J = nothing + end + if !isnothing(hess) + prep_∇²J = prepare_hessian(Jfunc!, hess, Z̃_J, context_J...; strict) + else + prep_∇²J = nothing + end + ∇J = Vector{JNT}(undef, nZ̃) + ∇²J = init_diffmat(JNT, hess, prep_∇²J, nZ̃, nZ̃) function Jfunc(Z̃arg::Vararg{T, N}) where {N, T<:Real} - update_objective!(J, ∇J, Z̃_∇J, Z̃arg) + update_memoized_diff!( + Z̃_J, J, ∇J, ∇²J, prep_∇J, prep_∇²J, context_J, grad, hess, Jfunc!, Z̃arg + ) return J[]::T end - ∇Jfunc! = if nZ̃ == 1 # univariate syntax (see JuMP.@operator doc): + ∇Jfunc! = if nZ̃ == 1 # univariate syntax (see JuMP.@operator doc): function (Z̃arg) - update_objective!(J, ∇J, Z̃_∇J, Z̃arg) + update_memoized_diff!( + Z̃_J, J, ∇J, ∇²J, prep_∇J, prep_∇²J, context_J, grad, hess, Jfunc!, Z̃arg + ) return ∇J[begin] end - else # multivariate syntax (see JuMP.@operator doc): + else # multivariate syntax (see JuMP.@operator doc): function (∇Jarg::AbstractVector{T}, Z̃arg::Vararg{T, N}) where {N, T<:Real} - update_objective!(J, ∇J, Z̃_∇J, Z̃arg) + update_memoized_diff!( + Z̃_J, J, ∇J, ∇²J, prep_∇J, prep_∇²J, context_J, grad, hess, Jfunc!, Z̃arg + ) return ∇Jarg .= ∇J end end + ∇²Jfunc! = if nZ̃ == 1 # univariate syntax (see JuMP.@operator doc): + function (Z̃arg) + update_memoized_diff!( + Z̃_J, J, ∇J, ∇²J, prep_∇J, prep_∇²J, context_J, grad, hess, Jfunc!, Z̃arg + ) + return ∇²J[begin, begin] + end + else # multivariate syntax (see JuMP.@operator doc): + function (∇²Jarg::AbstractMatrix{T}, Z̃arg::Vararg{T, N}) where {N, T<:Real} + update_memoized_diff!( + Z̃_J, J, ∇J, ∇²J, prep_∇J, prep_∇²J, context_J, grad, hess, Jfunc!, Z̃arg + ) + # JuMP.@operator only reads the lower-left triangle of the Hessian: + for i in 1:N, j in 1:i + ∇²Jarg[i, j] = ∇²J[i, j] + end + return ∇²Jarg + end + end + J_args = isnothing(hess) ? (Jfunc, ∇Jfunc!) : (Jfunc, ∇Jfunc!, ∇²Jfunc!)
# --------------------- inequality constraint functions ------------------------------- function gfunc!(g, Z̃, ΔŨ, x̂0end, Ue, Ŷe, U0, Ŷ0, Û0, K0, X̂0, gc, geq) update_predictions!(ΔŨ, x̂0end, Ue, Ŷe, U0, Ŷ0, Û0, K0, X̂0, gc, g, geq, mpc, Z̃) return g end - Z̃_∇g = fill(myNaN, nZ̃) # NaN to force update_predictions! at first call - ∇g_context = ( + Z̃_g = fill(myNaN, nZ̃) # NaN to force update_predictions! at first call + context_g = ( Cache(ΔŨ), Cache(x̂0end), Cache(Ue), Cache(Ŷe), Cache(U0), Cache(Ŷ0), Cache(Û0), Cache(K0), Cache(X̂0), Cache(gc), Cache(geq), ) # temporarily enable all the inequality constraints for sparsity detection: mpc.con.i_g[1:end-nc] .= true - ∇g_prep = prepare_jacobian(gfunc!, g, jac, Z̃_∇g, ∇g_context...; strict) + prep_∇g = prepare_jacobian(gfunc!, g, jac, Z̃_g, context_g...; strict) mpc.con.i_g[1:end-nc] .= false - ∇g = init_diffmat(JNT, jac, ∇g_prep, nZ̃, ng) - function update_con!(g, ∇g, Z̃, Z̃arg) - if isdifferent(Z̃arg, Z̃) - Z̃ .= Z̃arg - value_and_jacobian!(gfunc!, g, ∇g, ∇g_prep, jac, Z̃, ∇g_context...) - end - end + ∇g = init_diffmat(JNT, jac, prep_∇g, nZ̃, ng) gfuncs = Vector{Function}(undef, ng) for i in eachindex(gfuncs) gfunc_i = function (Z̃arg::Vararg{T, N}) where {N, T<:Real} - update_con!(g, ∇g, Z̃_∇g, Z̃arg) + update_memoized_diff!(Z̃_g, g, ∇g, prep_∇g, context_g, jac, gfunc!, Z̃arg) return g[i]::T end gfuncs[i] = gfunc_i @@ -664,40 +703,37 @@ function get_optim_functions(mpc::NonLinMPC, ::JuMP.GenericModel{JNT}) where JNT for i in eachindex(∇gfuncs!) ∇gfuncs_i! = if nZ̃ == 1 # univariate syntax (see JuMP.@operator doc): function (Z̃arg::T) where T<:Real - update_con!(g, ∇g, Z̃_∇g, Z̃arg) + update_memoized_diff!(Z̃_g, g, ∇g, prep_∇g, context_g, jac, gfunc!, Z̃arg) return ∇g[i, begin] end - else # multivariate syntax (see JuMP.@operator doc): + else # multivariate syntax (see JuMP.@operator doc): function (∇g_i, Z̃arg::Vararg{T, N}) where {N, T<:Real} - update_con!(g, ∇g, Z̃_∇g, Z̃arg) + update_memoized_diff!(Z̃_g, g, ∇g, prep_∇g, context_g, jac, gfunc!, Z̃arg) return ∇g_i .= @views ∇g[i, :] end end ∇gfuncs![i] = ∇gfuncs_i! end + g_vec_args = (gfuncs, ∇gfuncs!) # --------------------- equality constraint functions --------------------------------- function geqfunc!(geq, Z̃, ΔŨ, x̂0end, Ue, Ŷe, U0, Ŷ0, Û0, K0, X̂0, gc, g) update_predictions!(ΔŨ, x̂0end, Ue, Ŷe, U0, Ŷ0, Û0, K0, X̂0, gc, g, geq, mpc, Z̃) return geq end - Z̃_∇geq = fill(myNaN, nZ̃) # NaN to force update_predictions! at first call - ∇geq_context = ( + Z̃_geq = fill(myNaN, nZ̃) # NaN to force update_predictions! at first call + context_geq = ( Cache(ΔŨ), Cache(x̂0end), Cache(Ue), Cache(Ŷe), Cache(U0), Cache(Ŷ0), Cache(Û0), Cache(K0), Cache(X̂0), Cache(gc), Cache(g) ) - ∇geq_prep = prepare_jacobian(geqfunc!, geq, jac, Z̃_∇geq, ∇geq_context...; strict) - ∇geq = init_diffmat(JNT, jac, ∇geq_prep, nZ̃, neq) - function update_con_eq!(geq, ∇geq, Z̃, Z̃arg) - if isdifferent(Z̃arg, Z̃) - Z̃ .= Z̃arg - value_and_jacobian!(geqfunc!, geq, ∇geq, ∇geq_prep, jac, Z̃, ∇geq_context...) 
- end - end + prep_∇geq = prepare_jacobian(geqfunc!, geq, jac, Z̃_geq, context_geq...; strict) + ∇geq = init_diffmat(JNT, jac, prep_∇geq, nZ̃, neq) geqfuncs = Vector{Function}(undef, neq) for i in eachindex(geqfuncs) geqfunc_i = function (Z̃arg::Vararg{T, N}) where {N, T<:Real} - update_con_eq!(geq, ∇geq, Z̃_∇geq, Z̃arg) + update_memoized_diff!( + Z̃_geq, geq, ∇geq, prep_∇geq, context_geq, jac, geqfunc!, Z̃arg + ) return geq[i]::T end geqfuncs[i] = geqfunc_i @@ -708,12 +744,15 @@ function get_optim_functions(mpc::NonLinMPC, ::JuMP.GenericModel{JNT}) where JNT # constraints imply MultipleShooting, thus input increment ΔU and state X̂0 in Z̃: ∇geqfuncs_i! = function (∇geq_i, Z̃arg::Vararg{T, N}) where {N, T<:Real} - update_con_eq!(geq, ∇geq, Z̃_∇geq, Z̃arg) + update_memoized_diff!( + Z̃_geq, geq, ∇geq, prep_∇geq, context_geq, jac, geqfunc!, Z̃arg + ) return ∇geq_i .= @views ∇geq[i, :] end ∇geqfuncs![i] = ∇geqfuncs_i! end - return Jfunc, ∇Jfunc!, gfuncs, ∇gfuncs!, geqfuncs, ∇geqfuncs! + geq_vec_args = (geqfuncs, ∇geqfuncs!) + return J_args, g_vec_args, geq_vec_args end """ diff --git a/src/controller/transcription.jl b/src/controller/transcription.jl index b737092f..6c0b69d2 100644 --- a/src/controller/transcription.jl +++ b/src/controller/transcription.jl @@ -604,21 +604,18 @@ end """ init_nonlincon!( - mpc::PredictiveController, model::LinModel, transcription::TranscriptionMethod, - gfuncs , ∇gfuncs!, - geqfuncs, ∇geqfuncs! - ) + mpc::PredictiveController, ::LinModel, ::TranscriptionMethod, g_vec_args, geq_vec_args + ) -> nothing Init nonlinear constraints for [`LinModel`](@ref) for all [`TranscriptionMethod`](@ref). The only nonlinear constraints are the custom inequality constraints `gc`. """ function init_nonlincon!( - mpc::PredictiveController, ::LinModel, ::TranscriptionMethod, - gfuncs, ∇gfuncs!, - _ , _ + mpc::PredictiveController, ::LinModel, ::TranscriptionMethod, g_vec_args, _ ) optim, con = mpc.optim, mpc.con + gfuncs, ∇gfuncs! = g_vec_args nZ̃ = length(mpc.Z̃) if length(con.i_g) ≠ 0 i_base = 0 @@ -634,10 +631,8 @@ end """ init_nonlincon!( - mpc::PredictiveController, model::NonLinModel, transcription::MultipleShooting, - gfuncs, ∇gfuncs!, - geqfuncs, ∇geqfuncs! - ) + mpc::PredictiveController, ::NonLinModel, ::MultipleShooting, g_vec_args, geq_vec_args + ) -> nothing Init nonlinear constraints for [`NonLinModel`](@ref) and [`MultipleShooting`](@ref). @@ -645,11 +640,11 @@ The nonlinear constraints are the output prediction `Ŷ` bounds, the custom ine constraints `gc` and all the nonlinear equality constraints `geq`. """ function init_nonlincon!( - mpc::PredictiveController, ::NonLinModel, ::MultipleShooting, - gfuncs, ∇gfuncs!, - geqfuncs, ∇geqfuncs! + mpc::PredictiveController, ::NonLinModel, ::MultipleShooting, g_vec_args, geq_vec_args ) optim, con = mpc.optim, mpc.con + gfuncs , ∇gfuncs! = g_vec_args + geqfuncs, ∇geqfuncs! = geq_vec_args ny, nx̂, Hp, nZ̃ = mpc.estim.model.ny, mpc.estim.nx̂, mpc.Hp, length(mpc.Z̃) # --- nonlinear inequality constraints --- if length(con.i_g) ≠ 0 @@ -691,10 +686,8 @@ end """ init_nonlincon!( - mpc::PredictiveController, model::NonLinModel, ::SingleShooting, - gfuncs, ∇gfuncs!, - geqfuncs, ∇geqfuncs! - ) + mpc::PredictiveController, ::NonLinModel, ::SingleShooting, g_vec_args, geq_vec_args + ) -> nothing Init nonlinear constraints for [`NonLinModel`](@ref) and [`SingleShooting`](@ref). @@ -702,9 +695,10 @@ The nonlinear constraints are the custom inequality constraints `gc`, the output prediction `Ŷ` bounds and the terminal state `x̂end` bounds. 
""" function init_nonlincon!( - mpc::PredictiveController, ::NonLinModel, ::SingleShooting, gfuncs, ∇gfuncs!, _ , _ + mpc::PredictiveController, ::NonLinModel, ::SingleShooting, g_vec_args, _ ) optim, con = mpc.optim, mpc.con + gfuncs, ∇gfuncs! = g_vec_args ny, nx̂, Hp, nZ̃ = mpc.estim.model.ny, mpc.estim.nx̂, mpc.Hp, length(mpc.Z̃) if length(con.i_g) ≠ 0 i_base = 0 diff --git a/src/estimator/execute.jl b/src/estimator/execute.jl index 23408995..98b63c9e 100644 --- a/src/estimator/execute.jl +++ b/src/estimator/execute.jl @@ -330,7 +330,7 @@ function validate_args(estim::StateEstimator, ym, d, u=nothing) end """ - setstate!(estim::StateEstimator, x̂, P̂=nothing) -> estim + setstate!(estim::StateEstimator, x̂[, P̂]) -> estim Set `estim.x̂0` to `x̂ - estim.x̂op` from the argument `x̂`, and `estim.P̂` to `P̂` if applicable. diff --git a/src/estimator/mhe/construct.jl b/src/estimator/mhe/construct.jl index 4c6ebb71..beba35cb 100644 --- a/src/estimator/mhe/construct.jl +++ b/src/estimator/mhe/construct.jl @@ -250,33 +250,37 @@ transcription for now. - `model::SimModel` : (deterministic) model for the estimations. - `He=nothing` : estimation horizon ``H_e``, must be specified. - `i_ym=1:model.ny` : `model` output indices that are measured ``\mathbf{y^m}``, the rest - are unmeasured ``\mathbf{y^u}``. + are unmeasured ``\mathbf{y^u}``. - `σP_0=fill(1/model.nx,model.nx)` or *`sigmaP_0`* : main diagonal of the initial estimate - covariance ``\mathbf{P}(0)``, specified as a standard deviation vector. + covariance ``\mathbf{P}(0)``, specified as a standard deviation vector. - `σQ=fill(1/model.nx,model.nx)` or *`sigmaQ`* : main diagonal of the process noise - covariance ``\mathbf{Q}`` of `model`, specified as a standard deviation vector. + covariance ``\mathbf{Q}`` of `model`, specified as a standard deviation vector. - `σR=fill(1,length(i_ym))` or *`sigmaR`* : main diagonal of the sensor noise covariance - ``\mathbf{R}`` of `model` measured outputs, specified as a standard deviation vector. + ``\mathbf{R}`` of `model` measured outputs, specified as a standard deviation vector. - `nint_u=0`: integrator quantity for the stochastic model of the unmeasured disturbances at - the manipulated inputs (vector), use `nint_u=0` for no integrator (see Extended Help). + the manipulated inputs (vector), use `nint_u=0` for no integrator (see Extended Help). - `nint_ym=default_nint(model,i_ym,nint_u)` : same than `nint_u` but for the unmeasured - disturbances at the measured outputs, use `nint_ym=0` for no integrator (see Extended Help). + disturbances at the measured outputs, use `nint_ym=0` for no integrator (see Extended Help). - `σQint_u=fill(1,sum(nint_u))` or *`sigmaQint_u`* : same than `σQ` but for the unmeasured - disturbances at manipulated inputs ``\mathbf{Q_{int_u}}`` (composed of integrators). + disturbances at manipulated inputs ``\mathbf{Q_{int_u}}`` (composed of integrators). - `σPint_u_0=fill(1,sum(nint_u))` or *`sigmaPint_u_0`* : same than `σP_0` but for the unmeasured - disturbances at manipulated inputs ``\mathbf{P_{int_u}}(0)`` (composed of integrators). + disturbances at manipulated inputs ``\mathbf{P_{int_u}}(0)`` (composed of integrators). - `σQint_ym=fill(1,sum(nint_ym))` or *`sigmaQint_u`* : same than `σQ` for the unmeasured - disturbances at measured outputs ``\mathbf{Q_{int_{ym}}}`` (composed of integrators). + disturbances at measured outputs ``\mathbf{Q_{int_{ym}}}`` (composed of integrators). 
- `σPint_ym_0=fill(1,sum(nint_ym))` or *`sigmaPint_ym_0`* : same than `σP_0` but for the unmeasured - disturbances at measured outputs ``\mathbf{P_{int_{ym}}}(0)`` (composed of integrators). + disturbances at measured outputs ``\mathbf{P_{int_{ym}}}(0)`` (composed of integrators). - `Cwt=Inf` : slack variable weight ``C``, default to `Inf` meaning hard constraints only. - `optim=default_optim_mhe(model)` : a [`JuMP.Model`](@extref) object with a quadratic or nonlinear optimizer for solving (default to [`Ipopt`](https://github.com/jump-dev/Ipopt.jl), or [`OSQP`](https://osqp.org/docs/parsers/jump.html) if `model` is a [`LinModel`](@ref)). -- `gradient=AutoForwardDiff()` : an `AbstractADType` backend for the gradient of the objective - function when `model` is not a [`LinModel`](@ref), see [`DifferentiationInterface` doc](@extref DifferentiationInterface List). +- `hessian=nothing` : an `AbstractADType` backend for the Hessian of the objective function + when `model` is not a [`LinModel`](@ref) (see [`DifferentiationInterface` doc](@extref DifferentiationInterface List)), + or `nothing` for the LBFGS approximation provided by `optim` (details in Extended Help). +- `gradient=isnothing(hessian) ? AutoForwardDiff() : nothing` : an `AbstractADType` backend + for the gradient of the objective function when `model` is not a [`LinModel`](@ref) (see + `hessian` for the options), or `nothing` to retrieve the lower-order derivatives from `hessian`. - `jacobian=AutoForwardDiff()` : an `AbstractADType` backend for the Jacobian of the - constraints when `model` is not a [`LinModel`](@ref), see `gradient` above for the options. + constraints when `model` is not a [`LinModel`](@ref) (see `hessian` for the options). - `direct=true`: construct with a direct transmission from ``\mathbf{y^m}`` (a.k.a. current estimator, in opposition to the delayed/predictor form). @@ -359,7 +363,9 @@ MovingHorizonEstimator estimator with a sample time Ts = 10.0 s, Ipopt optimizer default, a [`KalmanFilter`](@ref) estimates the arrival covariance (customizable). - Else, a nonlinear program with dense [`ForwardDiff`](@extref ForwardDiff) automatic differentiation (AD) compute the objective and constraint derivatives by default - (customizable). Optimizers generally benefit from exact derivatives like AD. However, + (customizable). The `hessian` argument defaults to `nothing`, i.e. the LBFGS approximation + of `optim`; see the [`NonLinMPC`](@ref) Extended Help for the sparse backend recommended + for an exact Hessian. Optimizers generally benefit from exact derivatives like AD. However, the `f` and `h` functions must be compatible with this feature. See the [`JuMP` documentation](@extref JuMP Common-mistakes-when-writing-a-user-defined-operator) for common mistakes when writing these functions. Also, an [`UnscentedKalmanFilter`](@ref) diff --git a/src/general.jl b/src/general.jl index 90a411db..0bb2fc28 100644 --- a/src/general.jl +++ b/src/general.jl @@ -51,14 +51,97 @@ function limit_solve_time(optim::GenericModel, Ts) if isa(err, MOI.UnsupportedAttribute{MOI.TimeLimitSec}) @warn "Solving time limit is not supported by the optimizer." else - rethrow(err) + rethrow() end end end +"Verify that the provided 1st and 2nd order differentiation backends are a valid and efficient combination."
+validate_backends(firstOrder::AbstractADType, secondOrder::Nothing) = nothing +validate_backends(firstOrder::Nothing, secondOrder::AbstractADType) = nothing +function validate_backends(firstOrder::AbstractADType, secondOrder::AbstractADType) + @warn( + """ + Two AbstractADType backends were provided for the 1st and 2nd order differentiations, + meaning that the 1st order derivatives will be computed twice. Use nothing for the 1st + order backend to retrieve the results from the hessian backend, which is more efficient. + """ + ) + return nothing +end +function validate_backends(firstOrder::Nothing, secondOrder::Nothing) + throw(ArgumentError("The 1st and 2nd order differentiation backends cannot both be nothing.")) +end + "Init a differentiation result matrix as dense or sparse matrix, as required by `backend`." init_diffmat(T, backend::AbstractADType, _ , nx , ny) = Matrix{T}(undef, ny, nx) init_diffmat(T, backend::AutoSparse ,prep , _ , _ ) = similar(sparsity_pattern(prep), T) +init_diffmat(T, backend::Nothing , _ , nx , ny) = Matrix{T}(undef, ny, nx) + +""" + update_memoized_diff!( + x, y, ∇f, ∇²f, prep_∇f, prep_∇²f, context, + gradient::AbstractADType, hessian::Nothing, f!, xarg + ) -> nothing + +Update the `f!` value `y` and its gradient `∇f` in-place if `x ≠ xarg`. + +This method mutates all the arguments before `gradient`. This function is used for the +memoization of the `f!` function derivatives, to avoid redundant computations with the +splatting syntax of `JuMP.@operator`. +""" +function update_memoized_diff!( + x, y, ∇f, _ , prep_∇f, _ , context, + gradient::AbstractADType, hessian::Nothing, f!::F, xarg +) where F <: Function + if isdifferent(xarg, x) + x .= xarg # more efficient than individual f! and gradient! calls: + y[], _ = value_and_gradient!(f!, ∇f, prep_∇f, gradient, x, context...) + end + return nothing +end + +"Also update the Hessian `∇²f` if `hessian isa AbstractADType` and `isnothing(gradient)`." +function update_memoized_diff!( + x, y, ∇f, ∇²f, _ , prep_∇²f, context, + gradient::Nothing, hessian::AbstractADType, f!::F, xarg +) where F <: Function + if isdifferent(xarg, x) + x .= xarg # more efficient than individual f!, gradient! and hessian! calls: + y[], _ = value_gradient_and_hessian!(f!, ∇f, ∇²f, prep_∇²f, hessian, x, context...) + end + return nothing +end + +"Update `∇f` and `∇²f` individually if both backends are `AbstractADType`." +function update_memoized_diff!( + x, y, ∇f, ∇²f, prep_∇f, prep_∇²f, context, + gradient::AbstractADType, hessian::AbstractADType, f!::F, xarg +) where F <: Function + if isdifferent(xarg, x) + x .= xarg # inefficient, as warned by validate_backends(), but still possible: + hessian!(f!, ∇²f, prep_∇²f, hessian, x, context...) + y[], _ = value_and_gradient!(f!, ∇f, prep_∇f, gradient, x, context...) + end + return nothing +end + +""" + update_memoized_diff!(x, y, ∇f, prep_∇f, context, jacobian::AbstractADType, f!, xarg) + +Update the `f!` value `y` (vector) and its Jacobian `∇f` in-place if `x ≠ xarg`. + +This method mutates all the arguments before `jacobian`. +""" +function update_memoized_diff!( + x, y, ∇f, prep_∇f, context, jacobian::AbstractADType, f!::F, xarg +) where F <: Function + if isdifferent(xarg, x) + x .= xarg # more efficient than individual f! and jacobian! calls: + value_and_jacobian!(f!, y, ∇f, prep_∇f, jacobian, x, context...) + end + return nothing +end "Verify that x and y elements are different using `!==`."
isdifferent(x, y) = any(xi !== yi for (xi, yi) in zip(x, y)) diff --git a/test/2_test_state_estim.jl b/test/2_test_state_estim.jl index c1cb8e76..082e0c45 100644 --- a/test/2_test_state_estim.jl +++ b/test/2_test_state_estim.jl @@ -1364,7 +1364,7 @@ end X̂_mhe = zeros(4, 6) X̂_kf = zeros(4, 6) for i in 1:6 - y = [50,31] #+ randn(2) + y = [50,31] + randn(2) x̂_mhe = preparestate!(mhe, y, [25]) x̂_kf = preparestate!(kf, y, [25]) X̂_mhe[:,i] = x̂_mhe
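Below is a minimal usage sketch of the new `hessian` keyword introduced by this patch, for reviewers only and not part of the diff. The `model` variable and the horizons `Hp=10`, `Hc=2` are placeholder assumptions; the keyword names and backend constructors come from the patch and the existing `NonLinMPC` docstring.

```julia
using ModelPredictiveControl
using DifferentiationInterface: AutoForwardDiff, AutoSparse
using SparseConnectivityTracer: TracerSparsityDetector
using SparseMatrixColorings: GreedyColoringAlgorithm

# `model` is assumed to be an existing NonLinModel instance.
# Sparse exact Hessian backend, mirroring the sparse Jacobian backend of the docstring:
sparseAD = AutoSparse(
    AutoForwardDiff();
    sparsity_detector  = TracerSparsityDetector(),
    coloring_algorithm = GreedyColoringAlgorithm()
)
# With `hessian` provided, `gradient` defaults to `nothing`, so the gradient is retrieved
# from the same 2nd-order backend call (see update_memoized_diff!):
mpc_exact = NonLinMPC(model; Hp=10, Hc=2, hessian=sparseAD)

# Default behaviour: LBFGS Hessian approximation in the optimizer, exact AD gradient:
mpc_lbfgs = NonLinMPC(model; Hp=10, Hc=2)
```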
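The memoization pattern behind `update_memoized_diff!` can also be illustrated in isolation. The sketch below is a simplified, self-contained analogue (names are illustrative, not from the package): it caches the last evaluation point so that the value and derivative callbacks generated for `JuMP.@operator` do not recompute the predictions when called back-to-back at the same point.

```julia
# Simplified analogue of the memoization used for the JuMP.@operator callbacks:
# the value is recomputed only when the splatted argument differs from the last point.
function make_memoized(f, n)
    x_last = fill(NaN, n)          # NaN forces a computation on the first call
    y_last = Ref(NaN)
    function f_memo(xarg::Vararg{Float64})
        if any(xarg[i] !== x_last[i] for i in eachindex(x_last))
            x_last .= xarg
            y_last[] = f(collect(xarg))
        end
        return y_last[]
    end
    return f_memo
end

f_memo = make_memoized(x -> sum(abs2, x), 3)
f_memo(1.0, 2.0, 3.0)   # computes and caches the value
f_memo(1.0, 2.0, 3.0)   # same point: returns the cached value without recomputing
```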