diff --git a/benchmark/MPSKitBenchmarks/MPSKitBenchmarks.jl b/benchmark/MPSKitBenchmarks/MPSKitBenchmarks.jl
index c70a01f70..9546df2ba 100644
--- a/benchmark/MPSKitBenchmarks/MPSKitBenchmarks.jl
+++ b/benchmark/MPSKitBenchmarks/MPSKitBenchmarks.jl
@@ -14,10 +14,12 @@ BenchmarkTools.DEFAULT_PARAMETERS.memory_tolerance = 0.01
 const PARAMS_PATH = joinpath(@__DIR__, "etc", "params.json")
 const SUITE = BenchmarkGroup()
 const MODULES = Dict{String, Symbol}(
-    "derivatives" => :DerivativesBenchmarks
+    "derivatives" => :DerivativesBenchmarks,
+    "timestep" => :TimestepBenchmarks,
 )
 
 include("derivatives/DerivativesBenchmarks.jl")
+include("timestep/TimestepBenchmarks.jl")
 
 load!(id::AbstractString; kwargs...) = load!(SUITE, id; kwargs...)
 
diff --git a/benchmark/MPSKitBenchmarks/timestep/TimestepBenchmarks.jl b/benchmark/MPSKitBenchmarks/timestep/TimestepBenchmarks.jl
new file mode 100644
index 000000000..fdc71afc5
--- /dev/null
+++ b/benchmark/MPSKitBenchmarks/timestep/TimestepBenchmarks.jl
@@ -0,0 +1,26 @@
+module TimestepBenchmarks
+
+using BenchmarkTools
+using MPSKit
+using MPSKitModels
+
+const SUITE = BenchmarkGroup()
+const dt = 0.05
+
+const HAMILTONIANS = (
+    "tfim" => transverse_field_ising(),
+    "heis_xxx" => heisenberg_XXX(),
+)
+
+let g_top = addgroup!(SUITE, "make_time_mpo")
+    for (Hname, H) in HAMILTONIANS
+        g = addgroup!(g_top, Hname)
+        for N in (1, 2, 3)
+            alg = TaylorCluster(; N)
+            g["N=$N"] = @benchmarkable make_time_mpo($H, $dt, $alg)
+        end
+        g["WII"] = @benchmarkable make_time_mpo($H, $dt, $(MPSKit.WII()))
+    end
+end
+
+end # module
diff --git a/src/algorithms/timestep/taylorcluster.jl b/src/algorithms/timestep/taylorcluster.jl
index a24142431..9e5943288 100644
--- a/src/algorithms/timestep/taylorcluster.jl
+++ b/src/algorithms/timestep/taylorcluster.jl
@@ -27,58 +27,101 @@ First order Taylor expansion for a time-evolution MPO.
 """
 const WI = TaylorCluster(; N = 1, extension = false, compression = false)
 
+# For type stability reasons, we add a function barrier here and dispatch on Val(N).
+# `LinearIndices` and `CartesianIndex{N}` are otherwise abstract.
+
 function make_time_mpo(
         H::MPOHamiltonian, dt::Number, alg::TaylorCluster;
         tol = eps(real(scalartype(H))), imaginary_evolution::Bool = false
     )
-    N = alg.N
     τ = imaginary_evolution ? -dt : -1im * dt
+    return _make_time_mpo(H, τ, Val(alg.N), alg.extension, alg.compression; tol, imaginary_evolution)
+end
 
-    # start with H^N
+function _make_time_mpo(
+        H::MPOHamiltonian, τ::Number, ::Val{N},
+        extension::Bool, compression::Bool;
+        tol, imaginary_evolution::Bool
+    ) where {N}
     H_n = H^N
     virtual_sz = map(0:length(H)) do i
         return i == 0 ? size(H[1], 1) : size(H[i], 4)
     end
     linds = map(virtual_sz) do sz
-        return LinearIndices(ntuple(Returns(sz), N))
+        return LinearIndices(ntuple(Returns(sz), Val(N)))
+    end
+
+    extension && _taylor_extension!(H_n, H, virtual_sz, linds, Val(N), τ)
+
+    mpo = MPO(map(SparseBlockTensorMap, parent(H_n)))
+    _taylor_loopback!(mpo, virtual_sz, linds, Val(N), τ)
+    _taylor_remove_equivalents!(mpo, virtual_sz, linds)
+
+    compression && _taylor_compression!(mpo, virtual_sz, linds, Val(N), τ)
+
+    return remove_orphans!(mpo; tol)
+end
+
+# Stable partition: elements equal to `sentinel` first, others after, preserving relative order within each group.
+@inline function _partition_first(t::NTuple{M, Int}, sentinel::Int) where {M}
+    n_match = count(==(sentinel), t)
+    return ntuple(Val(M)) do j
+        if j <= n_match
+            _kth_match(t, sentinel, j, true)
+        else
+            _kth_match(t, sentinel, j - n_match, false)
+        end
     end
+end
+
+@inline function _kth_match(t::NTuple{M, Int}, sentinel::Int, k::Int, want_match::Bool) where {M}
+    cnt = 0
+    for x in t
+        if (x == sentinel) == want_match
+            cnt += 1
+            cnt == k && return x
+        end
+    end
+    return 0
+end
 
-    # extension step: Algorithm 3
-    # incorporate higher order terms
+# Algorithm 3 of Van Damme et al. (2024): incorporate higher-order corrections
+# from `H_next = H_n * H` into `H_n` in place.
+function _taylor_extension!(H_n, H, virtual_sz, linds, ::Val{N}, τ::Number) where {N}
     # TODO: don't need to fully construct H_next...
-    if alg.extension
-        H_next = H_n * H
-        for (i, slice) in enumerate(parent(H_n))
-            linds_left = linds[i]
-            linds_right = linds[i + 1]
-            V_left = virtual_sz[i]
-            V_right = virtual_sz[i + 1]
-            linds_next_left = LinearIndices(ntuple(Returns(V_left), N + 1))
-            linds_next_right = LinearIndices(ntuple(Returns(V_right), N + 1))
-
-            for a in CartesianIndices(linds_left), b in CartesianIndices(linds_right)
-                all(>(1), b.I) || continue
-                all(in((1, V_left)), a.I) && any(==(V_left), a.I) && continue
-
-                n1 = count(==(1), a.I) + 1
-                n3 = count(==(V_right), b.I) + 1
-                factor = τ * factorial(N) / (factorial(N + 1) * n1 * n3)
-
-                for c in 1:(N + 1), d in 1:(N + 1)
-                    aₑ = insert!([a.I...], c, 1)
-                    bₑ = insert!([b.I...], d, V_right)
-
-                    # TODO: use VectorInterface for memory efficiency
-                    slice[linds_left[a], 1, 1, linds_right[b]] += factor *
-                        H_next[i][linds_next_left[aₑ...], 1, 1, linds_next_right[bₑ...]]
-                end
+    H_next = H_n * H
+    for (i, slice) in enumerate(parent(H_n))
+        linds_left = linds[i]
+        linds_right = linds[i + 1]
+        V_left = virtual_sz[i]
+        V_right = virtual_sz[i + 1]
+        linds_next_left = LinearIndices(ntuple(Returns(V_left), Val(N + 1)))
+        linds_next_right = LinearIndices(ntuple(Returns(V_right), Val(N + 1)))
+
+        for a in CartesianIndices(linds_left), b in CartesianIndices(linds_right)
+            all(>(1), b.I) || continue
+            all(in((1, V_left)), a.I) && any(==(V_left), a.I) && continue
+
+            n1 = count(==(1), a.I) + 1
+            n3 = count(==(V_right), b.I) + 1
+            factor = τ * factorial(N) / (factorial(N + 1) * n1 * n3)
+
+            for c in 1:(N + 1), d in 1:(N + 1)
+                aₑ = TT.insertafter(a.I, c - 1, (1,))
+                bₑ = TT.insertafter(b.I, d - 1, (V_right,))
+
+                # TODO: use VectorInterface for memory efficiency
+                slice[linds_left[a], 1, 1, linds_right[b]] += factor *
+                    H_next[i][linds_next_left[aₑ...], 1, 1, linds_next_right[bₑ...]]
             end
         end
     end
+    return H_n
+end
 
-    # loopback step: Algorithm 1
-    # constructing the Nth order time evolution MPO
-    mpo = MPO(map(SparseBlockTensorMap, parent(H_n)))
+# Algorithm 1: project the auxiliary virtual-bond directions onto the physical
+# block, completing the Nth-order time-evolution MPO.
+function _taylor_loopback!(mpo, virtual_sz, linds, ::Val{N}, τ::Number) where {N}
     for (i, slice) in enumerate(parent(mpo))
         V_right = virtual_sz[i + 1]
         linds_right = linds[i + 1]
@@ -95,14 +138,18 @@ function make_time_mpo(
             end
         end
     end
+    return mpo
+end
 
-    # Remove equivalent rows and columns: Algorithm 2
+# Algorithm 2: collapse rows and columns that are equivalent under the
+# permutation symmetry of the Taylor expansion.
+function _taylor_remove_equivalents!(mpo, virtual_sz, linds)
     for (i, slice) in enumerate(parent(mpo))
         V_left = virtual_sz[i]
         linds_left = linds[i]
         for c in CartesianIndices(linds_left)
             c_lin = linds_left[c]
-            s_c = CartesianIndex(sort(collect(c.I); by = (!=(1)))...)
+            s_c = CartesianIndex(_partition_first(c.I, 1))
 
             n1 = count(==(1), c.I)
             n3 = count(==(V_left), c.I)
@@ -122,7 +169,7 @@ function make_time_mpo(
         linds_right = linds[i + 1]
         for c in CartesianIndices(linds_right)
             c_lin = linds_right[c]
-            s_r = CartesianIndex(sort(collect(c.I); by = (!=(V_right)))...)
+            s_r = CartesianIndex(_partition_first(c.I, V_right))
 
             n1 = count(==(1), c.I)
             n3 = count(==(V_right), c.I)
@@ -138,44 +185,46 @@ function make_time_mpo(
             end
         end
     end
+    return mpo
+end
 
-    # Approximate compression step: Algorithm 4
-    if alg.compression
-        for (i, slice) in enumerate(parent(mpo))
-            V_right = virtual_sz[i + 1]
-            linds_right = linds[i + 1]
-            for a in CartesianIndices(linds_right)
-                all(>(1), a.I) || continue
-                a_lin = linds_right[a]
-                n1 = count(==(V_right), a.I)
-                n1 == 0 && continue
-                b = CartesianIndex(replace(a.I, V_right => 1))
-                b_lin = linds_right[b]
-                factor = τ^n1 * factorial(N - n1) / factorial(N)
-                for k in 1:size(slice, 1)
-                    I = CartesianIndex(k, 1, 1, a_lin)
-                    if I in nonzero_keys(slice)
-                        slice[k, 1, 1, b_lin] += factor * slice[I]
-                        delete!(slice, I)
-                    end
-                end
-            end
-            V_left = virtual_sz[i]
-            linds_left = linds[i]
-            for a in CartesianIndices(linds_left)
-                all(>(1), a.I) || continue
-                a_lin = linds_left[a]
-                n1 = count(==(V_left), a.I)
-                n1 == 0 && continue
-                for k in 1:size(slice, 4)
-                    I = CartesianIndex(a_lin, 1, 1, k)
+# Algorithm 4: approximate compression — fold the right-going `V_right`
+# columns back into the `1`-column with the matching Taylor weight, then drop
+# the residual `V_left` rows.
+function _taylor_compression!(mpo, virtual_sz, linds, ::Val{N}, τ::Number) where {N}
+    for (i, slice) in enumerate(parent(mpo))
+        V_right = virtual_sz[i + 1]
+        linds_right = linds[i + 1]
+        for a in CartesianIndices(linds_right)
+            all(>(1), a.I) || continue
+            a_lin = linds_right[a]
+            n1 = count(==(V_right), a.I)
+            n1 == 0 && continue
+            b = CartesianIndex(map(x -> x == V_right ? 1 : x, a.I))
+            b_lin = linds_right[b]
+            factor = τ^n1 * factorial(N - n1) / factorial(N)
+            for k in 1:size(slice, 1)
+                I = CartesianIndex(k, 1, 1, a_lin)
+                if I in nonzero_keys(slice)
+                    slice[k, 1, 1, b_lin] += factor * slice[I]
                     delete!(slice, I)
                 end
             end
         end
+        V_left = virtual_sz[i]
+        linds_left = linds[i]
+        for a in CartesianIndices(linds_left)
+            all(>(1), a.I) || continue
+            a_lin = linds_left[a]
+            n1 = count(==(V_left), a.I)
+            n1 == 0 && continue
+            for k in 1:size(slice, 4)
+                I = CartesianIndex(a_lin, 1, 1, k)
+                delete!(slice, I)
+            end
+        end
     end
-
-    return remove_orphans!(mpo; tol)
+    return mpo
 end
 
 # Hack to treat FiniteMPOhamiltonians as Infinite
diff --git a/src/algorithms/timestep/wii.jl b/src/algorithms/timestep/wii.jl
index 742992e7a..5c39500fd 100644
--- a/src/algorithms/timestep/wii.jl
+++ b/src/algorithms/timestep/wii.jl
@@ -23,67 +23,82 @@ function make_time_mpo(
         H::InfiniteMPOHamiltonian, dt::Number, alg::WII;
         imaginary_evolution::Bool = false
     )
-    WA = H.A
-    WB = H.B
-    WC = H.C
-    WD = H.D
-
-    # needs to be complex if negative because of square root
-    δ = imaginary_evolution ? -complex(dt) : (-1im * dt)
+    T = complex(promote_type(scalartype(H), typeof(dt)))
+    δ = imaginary_evolution ? T(-dt) : T(-im * dt)
     exp_alg = Arnoldi(; alg.tol, alg.maxiter)
 
-    Wnew = map(1:length(H)) do i
-        for j in 2:(size(H[i], 1) - 1), k in 2:(size(H[i], 4) - 1)
-            init = (
-                isometry(storagetype(WD[i]), codomain(WD[i]), domain(WD[i])),
-                zero(H[i][1, 1, 1, k]), zero(H[i][j, 1, 1, end]), zero(H[i][j, 1, 1, k]),
-            )
-
-            y, convhist = exponentiate(1.0, init, exp_alg) do (x1, x2, x3, x4)
-                @plansor y1[-1 -2; -3 -4] := δ * x1[-1 1; -3 -4] *
-                    H[i][1, 1, 1, end][2 3; 1 4] * τ[-2 4; 2 3]
-
-                @plansor y2[-1 -2; -3 -4] := δ * x2[-1 1; -3 -4] *
-                    H[i][1, 1, 1, end][2 3; 1 4] * τ[-2 4; 2 3] +
-                    sqrt(δ) * x1[1 2; -3 4] * H[i][1, 1, 1, k][-1 -2; 3 -4] * τ[3 4; 1 2]
-
-                @plansor y3[-1 -2; -3 -4] := δ * x3[-1 1; -3 -4] *
-                    H[i][1, 1, 1, end][2 3; 1 4] * τ[-2 4; 2 3] +
-                    sqrt(δ) * x1[1 2; -3 4] * H[i][j, 1, 1, end][-1 -2; 3 -4] * τ[3 4; 1 2]
-
-                @plansor y4[-1 -2; -3 -4] := (
-                    δ * x4[-1 1; -3 -4] * H[i][1, 1, 1, end][2 3; 1 4] * τ[-2 4; 2 3] +
-                        x1[1 2; -3 4] * H[i][j, 1, 1, k][-1 -2; 3 -4] * τ[3 4; 1 2]
-                ) + (
-                    sqrt(δ) * x2[1 2; -3 -4] * H[i][j, 1, 1, end][-1 -2; 3 4] * τ[3 4; 1 2]
-                ) + (sqrt(δ) * x3[-1 4; -3 3] * H[i][1, 1, 1, k][2 -2; 1 -4] * τ[3 4; 1 2])
-
-                return y1, y2, y3, y4
-            end
-
-            convhist.converged == 0 &&
-                @warn "failed to exponentiate $(convhist.normres)"
-
-            WA[i][j - 1, 1, 1, k - 1] = y[4]
-            WB[i][j - 1, 1, 1, 1] = y[3]
-            WC[i][1, 1, 1, k - 1] = y[2]
-            WD[i] = y[1]
-        end
-
-        Vₗ = left_virtualspace(H, i)[1:(end - 1)]
-        Vᵣ = right_virtualspace(H, i)[1:(end - 1)]
-        P = physicalspace(H, i)
-
-        h′ = similar(H[i], Vₗ ⊗ P ← P ⊗ Vᵣ)
-        h′[2:end, 1, 1, 2:end] = WA[i]
-        h′[2:end, 1, 1, 1] = WB[i]
-        h′[1, 1, 1, 2:end] = WC[i]
-        h′[1, 1, 1, 1] = WD[i]
-
-        return h′
+    O = tmap(parent(parent(H)); scheduler = Defaults.scheduler[]) do W
+        return _make_time_mpo(W, δ, exp_alg)
+    end
+    return InfiniteMPO(PeriodicArray(O))
+end
+
+function _make_time_mpo(W::JordanMPOTensor, δ, exp_alg)
+    # Allocate output
+    Vₗ = left_virtualspace(W)[1:(end - 1)]
+    Vᵣ = right_virtualspace(W)[1:(end - 1)]
+    P = physicalspace(W)
+    O = similar(W, storagetype(W), Vₗ ⊗ P ← P ⊗ Vᵣ)
+
+    # fill onsite first: treated exactly
+    expD = exp!(scale(only(W.D), δ))
+    O[1, 1, 1, 1] = insertrightunit(insertleftunit(expD, 1), 3)
+
+    # remaining entries:
+    D = W[1, 1, 1, end]
+    for j in 2:(size(W, 1) - 1), k in 2:(size(W, 4) - 1)
+        B = W[j, 1, 1, end]
+        C = W[1, 1, 1, k]
+        A = W[j, 1, 1, k]
+        O[j, 1, 1, k], O[j, 1, 1, 1], O[1, 1, 1, k], _ =
+            wii_solve_block!(W, A, B, C, D, δ, exp_alg)
     end
 
-    return InfiniteMPO(PeriodicArray(Wnew))
+    return O
+end
+
+function wii_solve_block!(
+        W, A, B, C, D, δ::T, exp_alg
+    ) where {T <: Number}
+    init = (zero(A), zero(B), zero(C), isometry(storagetype(D), space(D)))
+    op = WIIStep(A, B, C, D, δ)
+    (yᴬ, yᴮ, yᶜ, yᴰ), convhist = exponentiate(op, one(real(scalartype(W))), init, exp_alg)
+    convhist.converged == 0 && @warn "failed to exponentiate $(convhist.normres)"
+    return yᴬ, yᴮ, yᶜ, yᴰ
+end
+
+# Replaces the `do`-block closure passed to `exponentiate`.
+struct WIIStep{HA, HB, HC, HD, T <: Number}
+    A::HA  # interior  / propagation block
+    B::HB  # right boundary / operator end
+    C::HC  # left boundary  / operator start
+    D::HD  # onsite / diagonal
+    δ::T
+end
+
+function (op::WIIStep)((xᴬ, xᴮ, xᶜ, xᴰ))
+    δ = op.δ
+    sqrtδ = sqrt(δ)
+
+    @plansor yᴰ[-1 -2; -3 -4] := δ * xᴰ[-1 1; -3 -4] *
+        op.D[2 3; 1 4] * τ[-2 4; 2 3]
+
+    @plansor yᶜ[-1 -2; -3 -4] := δ * xᶜ[-1 1; -3 -4] *
+        op.D[2 3; 1 4] * τ[-2 4; 2 3] +
+        sqrtδ * xᴰ[1 2; -3 4] * op.C[-1 -2; 3 -4] * τ[3 4; 1 2]
+
+    @plansor yᴮ[-1 -2; -3 -4] := δ * xᴮ[-1 1; -3 -4] *
+        op.D[2 3; 1 4] * τ[-2 4; 2 3] +
+        sqrtδ * xᴰ[1 2; -3 4] * op.B[-1 -2; 3 -4] * τ[3 4; 1 2]
+
+    @plansor yᴬ[-1 -2; -3 -4] := (
+        δ * xᴬ[-1 1; -3 -4] * op.D[2 3; 1 4] * τ[-2 4; 2 3] +
+            xᴰ[1 2; -3 4] * op.A[-1 -2; 3 -4] * τ[3 4; 1 2]
+    ) + (
+        sqrtδ * xᶜ[1 2; -3 -4] * op.B[-1 -2; 3 4] * τ[3 4; 1 2]
+    ) + (sqrtδ * xᴮ[-1 4; -3 3] * op.C[2 -2; 1 -4] * τ[3 4; 1 2])
+
+    return yᴬ, yᴮ, yᶜ, yᴰ
 end
 
 # Hack to treat FiniteMPOhamiltonians as Infinite