diff --git a/Project.toml b/Project.toml
index 520d6cb..f695fc6 100644
--- a/Project.toml
+++ b/Project.toml
@@ -36,7 +36,7 @@ MatrixAlgebraKit = "0.6"
 Random = "1"
 SafeTestsets = "0.1"
 Strided = "2.3.3"
-TensorKit = "0.16.4"
+TensorKit = "0.16, 0.17"
 TensorOperations = "5"
 Test = "1"
 TestExtras = "0.2, 0.3"
diff --git a/ext/BlockTensorKitGPUArraysExt.jl b/ext/BlockTensorKitGPUArraysExt.jl
index ff5f217..2d220c0 100644
--- a/ext/BlockTensorKitGPUArraysExt.jl
+++ b/ext/BlockTensorKitGPUArraysExt.jl
@@ -8,4 +8,15 @@ function KernelAbstractions.get_backend(BA::BlockArrays.BlockArray{T, N, A}) whe
     return KernelAbstractions.get_backend(first(BA.blocks))
 end
 
+function Base.copyto!(dest::BM, src::TA) where {T <: Number, TA <: AnyGPUMatrix{T}, BM <: BlockMatrix{T, Matrix{TA}}}
+    # TODO -- should we use Threads here to parallelize these
+    # transfers in streams if possible?
+    for block_index in Iterators.product(blockaxes(dest)...)
+        indices = getindex.(axes(dest), block_index)
+        dest_view = @view dest[block_index...]
+        dest_view .= src[indices...]
+    end
+    return dest
+end
+
 end
diff --git a/src/linalg/factorizations.jl b/src/linalg/factorizations.jl
index 75045eb..94cf57f 100644
--- a/src/linalg/factorizations.jl
+++ b/src/linalg/factorizations.jl
@@ -25,11 +25,10 @@ for f! in (
     )
     @eval function MAK.$f!(t::AbstractBlockTensorMap, F, alg::AbstractAlgorithm)
         TensorKit.foreachblock(t, F...) do _, (tblock, Fblocks...)
-            dense_block = similar_dense(tblock)
-            Fblocks′ = MAK.$f!(copy_dense!(dense_block, tblock), alg)
+            Fblocks′ = MAK.$f!(copy_dense!(similar_dense(tblock), tblock), alg)
             # deal with the case where the output is not in-place
             for (b′, b) in zip(Fblocks′, Fblocks)
-                b === b′ || copy!(b, b′)
+                b === b′ || copyto!(b, b′)
             end
             return nothing
         end
@@ -45,10 +44,9 @@ for f! in (
     )
     @eval function MAK.$f!(t::AbstractBlockTensorMap, N, alg::AbstractAlgorithm)
         TensorKit.foreachblock(t, N) do _, (tblock, Nblock)
-            dense_block = similar_dense(tblock)
-            Nblock′ = MAK.$f!(copy_dense!(dense_block, tblock), alg)
+            Nblock′ = MAK.$f!(copy_dense!(similar_dense(tblock), tblock), alg)
             # deal with the case where the output is not the same as the input
-            Nblock === Nblock′ || copy!(Nblock, Nblock′)
+            Nblock === Nblock′ || copyto!(Nblock, Nblock′)
             return nothing
         end
         return N
@@ -190,3 +188,30 @@ for f! in (
     @eval MAK.$f!(::AbstractBlockTensorMap, x, ::DiagonalAlgorithm) =
         error("Blocktensors are incompatible with diagonal algorithm")
 end
+
+function TensorKit.Factorizations.truncate_domain!(tdst::AbstractBlockTensorMap, tsrc::AbstractBlockTensorMap, inds)
+    TensorKit.foreachblock(tdst, tsrc) do c, (dst_block, src_block)
+        I = get(inds, c, nothing)
+        dst_dense = copy_dense!(similar_dense(dst_block), dst_block)
+        src_dense = copy_dense!(similar_dense(src_block), src_block)
+        @assert !isnothing(I)
+        @views dst_dense .= src_dense[:, I]
+        # deal with the case where the output is not in-place
+        dst_dense === dst_block || copyto!(dst_block, dst_dense)
+        return nothing
+    end
+    return tdst
+end
+function TensorKit.Factorizations.truncate_codomain!(tdst::AbstractBlockTensorMap, tsrc::AbstractBlockTensorMap, inds)
+    TensorKit.foreachblock(tdst, tsrc) do c, (dst_block, src_block)
+        I = get(inds, c, nothing)
+        dst_dense = copy_dense!(similar_dense(dst_block), dst_block)
+        src_dense = copy_dense!(similar_dense(src_block), src_block)
+        @assert !isnothing(I)
+        @views dst_dense .= src_dense[I, :]
+        # deal with the case where the output is not in-place
+        dst_dense === dst_block || copyto!(dst_block, dst_dense)
+        return nothing
+    end
+    return tdst
+end
diff --git a/src/linalg/linalg.jl b/src/linalg/linalg.jl
index d2c1330..f276e42 100644
--- a/src/linalg/linalg.jl
+++ b/src/linalg/linalg.jl
@@ -239,3 +239,23 @@ function LinearAlgebra.isposdef!(t::AbstractBlockTensorMap)
     end
     return true
 end
+
+function LinearAlgebra.lmul!(D::DiagonalTensorMap, t::AbstractBlockTensorMap)
+    domain(D) == codomain(t) || throw(SpaceMismatch())
+    TensorKit.foreachblock(t, D) do c, (tblock, bs...)
+        tblock′ = lmul!(bs..., copy_dense!(similar_dense(tblock), tblock))
+        tblock === tblock′ || copyto!(tblock, tblock′)
+        return tblock
+    end
+    return t
+end
+
+function LinearAlgebra.rmul!(t::AbstractBlockTensorMap, D::DiagonalTensorMap)
+    codomain(D) == domain(t) || throw(SpaceMismatch())
+    TensorKit.foreachblock(t, D) do c, (tblock, bs...)
+        tblock′ = rmul!(copy_dense!(similar_dense(tblock), tblock), bs...)
+        tblock === tblock′ || copyto!(tblock, tblock′)
+        return tblock
+    end
+    return t
+end
diff --git a/src/tensors/abstractblocktensor/abstractarray.jl b/src/tensors/abstractblocktensor/abstractarray.jl
index 1f07f99..e099a83 100644
--- a/src/tensors/abstractblocktensor/abstractarray.jl
+++ b/src/tensors/abstractblocktensor/abstractarray.jl
@@ -288,7 +288,7 @@ function similar_tensormaptype(
     ) where {S}
     if eltype(t) === T && typeof(space(t)) === typeof(P)
         return T
-    elseif isconcretetype(T)
+    elseif isconcretetype(T) || T isa Union
         return tensormaptype(S, numout(P), numin(P), storagetype(T))
     else
         return AbstractTensorMap{scalartype(T), S, numout(P), numin(P)}
diff --git a/src/tensors/blocktensor.jl b/src/tensors/blocktensor.jl
index 1ee6fa5..97fd0df 100644
--- a/src/tensors/blocktensor.jl
+++ b/src/tensors/blocktensor.jl
@@ -161,6 +161,10 @@ end
 Base.eltype(::Type{<:BlockTensorMap{TT}}) where {TT} = TT
 Base.parent(t::BlockTensorMap) = t.data
 
+# handle this separately because the storagetype of `AbstractTensorMap` is
+# *always* Vector no matter the actual data storage type
+TK.storagetype(t::BlockTensorMap{AbstractTensorMap{E, S, N₁, N₂}}) where {E, S, N₁, N₂} = TK.promote_storagetype(values(t.data)...)
+
 function Base.copyto!(
         dest::BlockTensorMap, Rdest::CartesianIndices,
         src::BlockTensorMap, Rsrc::CartesianIndices,
diff --git a/src/tensors/sparseblocktensor.jl b/src/tensors/sparseblocktensor.jl
index 9756824..a170b82 100644
--- a/src/tensors/sparseblocktensor.jl
+++ b/src/tensors/sparseblocktensor.jl
@@ -168,6 +168,10 @@ VI.scalartype(::Type{<:SparseBlockTensorMap{TT}}) where {TT} = scalartype(TT)
 Base.parent(t::SparseBlockTensorMap) = SparseTensorArray(t.data, space(t))
 Base.eltype(::Type{<:SparseBlockTensorMap{TT}}) where {TT} = TT
 
+# handle this separately because the storagetype of `AbstractTensorMap` is
+# *always* Vector no matter the actual data storage type
+TK.storagetype(t::SparseBlockTensorMap{AbstractTensorMap{E, S, N₁, N₂}}) where {E, S, N₁, N₂} = TK.promote_storagetype(values(t.data)...)
+
 issparse(::SparseBlockTensorMap) = true
 nonzero_keys(t::SparseBlockTensorMap) = keys(t.data)
 nonzero_values(t::SparseBlockTensorMap) = values(t.data)
diff --git a/src/tensors/tensoroperations.jl b/src/tensors/tensoroperations.jl
index 80bb85a..3e5e766 100644
--- a/src/tensors/tensoroperations.jl
+++ b/src/tensors/tensoroperations.jl
@@ -31,7 +31,10 @@ for TTA in (:AbstractTensorMap, :AbstractBlockTensorMap), TTB in (:AbstractTenso
         ) where {N₁, N₂}
         S = TK.check_spacetype(A, B)
         TC′ = TK.promote_permute(TC, sectortype(S))
-        M = TK.promote_storagetype(TK.similarstoragetype(A, TC′), TK.similarstoragetype(B, TC′))
+        # explicitly compute storagetype here to work around eltype of AbstractTensorMap
+        MA = TK.similarstoragetype(TK.storagetype(A), TC′)
+        MB = TK.similarstoragetype(TK.storagetype(B), TC′)
+        M = TK.promote_storagetype(MA, MB)
         return if issparse(A) && issparse(B)
             sparseblocktensormaptype(S, N₁, N₂, M)
         else