diff --git a/.github/actions/pip_installation/action.yml b/.github/actions/pip_installation/action.yml
index 7e793b7c..a059cad1 100644
--- a/.github/actions/pip_installation/action.yml
+++ b/.github/actions/pip_installation/action.yml
@@ -14,8 +14,18 @@ runs:
     - name: Install dependencies
       shell: bash
      run: |
-        export CC="mpicc" HDF5_MPI="ON"
-        python -m pip install -r requirements.txt
-        python -m pip install numba
-        python -m pip install pythran
-
+        export CC="mpicc" HDF5_MPI="ON"
+        python -m pip install -r requirements.txt
+        python -m pip install numba
+        python -m pip install pythran
+    - uses: actions/checkout@v5
+      with:
+        repository: EmilyBourne/hptt
+        path: hptt
+    - name: Install hptt
+      shell: bash
+      run: |
+        cd hptt
+        make -j
+        cd pythonAPI
+        python -m pip install .
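Note: hptt is built from source in CI (checkout, make, then pip install of its pythonAPI) rather than pip-installed; compare the commented-out line in requirements.txt at the end of this patch. A quick smoke test of the resulting module — a sketch, not part of the patch, relying only on the tensorTransposeAndUpdate call that pygyro/model/layout.py uses further down:

    import numpy as np
    import hptt

    src = np.arange(24.0).reshape(2, 3, 4)   # C-contiguous test tensor
    perm = [2, 0, 1]
    dest = np.empty((4, 2, 3))
    # Semantics inferred from the patch: dest = 1.0 * transpose(src, perm) + 0.0 * dest
    hptt.tensorTransposeAndUpdate(perm, 1.0, src, 0.0, dest)
    assert np.array_equal(dest, np.transpose(src, perm))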
diff --git a/fullSimulation.py b/fullSimulation.py
index 8e9989c6..97024b65 100644
--- a/fullSimulation.py
+++ b/fullSimulation.py
@@ -218,6 +218,12 @@ def my_print(rank, *args, **kwargs):
     average_output = 0
     startPrint = max(0, ti % saveStep)
     timeForLoop = True
+
+    timeQN = 0
+    timeFluxAdv = 0
+    timevParAdv = 0
+    timePolAdv = 0
+
     while (ti < tN and timeForLoop):
         full_loop_start = time.time()
 
@@ -228,49 +234,77 @@ def my_print(rank, *args, **kwargs):
         # Compute f^n+1/2 using Lie splitting
         distribFunc.setLayout('flux_surface')
         distribFunc.saveGridValues()
+        time_start = time.time()
         fluxAdv.gridStep(distribFunc)
+        timeFluxAdv += (time.time()-time_start)
         distribFunc.setLayout('v_parallel')
         phi.setLayout('v_parallel_1d')
+        time_start = time.time()
         vParAdv.gridStep(distribFunc, phi, parGrad, parGradVals, halfStep)
+        timevParAdv += (time.time()-time_start)
         distribFunc.setLayout('poloidal')
         phi.setLayout('poloidal')
+        time_start = time.time()
         polAdv.gridStep(distribFunc, phi, halfStep)
+        timePolAdv += (time.time()-time_start)
 
         # Find phi from f^n+1/2 by solving QN eq again
         distribFunc.setLayout('v_parallel')
+        time_start = time.time()
         density.getPerturbedRho(distribFunc, rho)
+        timeQN += (time.time()-time_start)
         QNSolver.getModes(rho)
         rho.setLayout('mode_solve')
         phi.setLayout('mode_solve')
+        time_start = time.time()
         QNSolver.solveEquation(phi, rho)
+        timeQN += (time.time()-time_start)
         phi.setLayout('v_parallel_2d')
         rho.setLayout('v_parallel_2d')
+        time_start = time.time()
         QNSolver.findPotential(phi)
+        timeQN += (time.time()-time_start)
 
         # Compute f^n+1 using strang splitting
         distribFunc.restoreGridValues()  # restored from flux_surface layout
+        time_start = time.time()
         fluxAdv.gridStep(distribFunc)
+        timeFluxAdv += (time.time()-time_start)
         distribFunc.setLayout('v_parallel')
         phi.setLayout('v_parallel_1d')
+        time_start = time.time()
         vParAdv.gridStep(distribFunc, phi, parGrad, parGradVals, halfStep)
+        timevParAdv += (time.time()-time_start)
         distribFunc.setLayout('poloidal')
         phi.setLayout('poloidal')
+        time_start = time.time()
         polAdv.gridStep(distribFunc, phi, fullStep)
+        timePolAdv += (time.time()-time_start)
         distribFunc.setLayout('v_parallel')
+        time_start = time.time()
         vParAdv.gridStepKeepGradient(distribFunc, parGradVals, halfStep)
+        timevParAdv += (time.time()-time_start)
         distribFunc.setLayout('flux_surface')
+        time_start = time.time()
         fluxAdv.gridStep(distribFunc)
+        timeFluxAdv += (time.time()-time_start)
 
         # Find phi from f^n by solving QN eq
         distribFunc.setLayout('v_parallel')
+        time_start = time.time()
         density.getPerturbedRho(distribFunc, rho)
+        timeQN += (time.time()-time_start)
         QNSolver.getModes(rho)
         rho.setLayout('mode_solve')
         phi.setLayout('mode_solve')
+        time_start = time.time()
         QNSolver.solveEquation(phi, rho)
+        timeQN += (time.time()-time_start)
         phi.setLayout('v_parallel_2d')
         rho.setLayout('v_parallel_2d')
+        time_start = time.time()
         QNSolver.findPotential(phi)
+        timeQN += (time.time()-time_start)
 
         diagnostic_start = time.time()
         # Calculate diagnostic quantities
@@ -301,8 +335,6 @@ def my_print(rank, *args, **kwargs):
         timeForLoop = comm.allreduce((time.time(
         ) - setup_time_start + 2*average_loop + 2*average_output) < stopTime, op=MPI.LAND)
 
-        full_loop_time += (time.time()-full_loop_start)
-
         output_start = time.time()
         if (ti % saveStep != 0):
@@ -327,15 +359,31 @@ def my_print(rank, *args, **kwargs):
         print(s.getvalue(), file=open("profile/l2Test{}.txt".format(rank), "w"))
 
     if (rank == 0):
-        if (not os.path.isdir("timing")):
-            os.mkdir("timing")
+        if (not os.path.isdir(f"{foldername}/timing")):
+            os.mkdir(f"{foldername}/timing")
     MPI.COMM_WORLD.Barrier()
 
-    print("{loop:16.10e} {output:16.10e} {setup:16.10e} {diagnostic:16.10e}".
-          format(loop=full_loop_time, output=output_time, setup=setup_time,
-                 diagnostic=diagnostic_time),
-          file=open("timing/{}_l2Test{}.txt".format(MPI.COMM_WORLD.Get_size(), rank), "w"))
+    mpi_time = distribFunc._layout_manager._mpi_time
+    transp_time = distribFunc._layout_manager._transpose_time - mpi_time
+
+    timeOther = full_loop_time - sum((timeQN, timeFluxAdv, timevParAdv, timePolAdv, transp_time, mpi_time))
+
+    with open(f"{foldername}/timing/{MPI.COMM_WORLD.Get_size()}_l2Test{rank}.txt", "w") as timing_file:
+        for t in (timeQN, timeFluxAdv, timevParAdv, timePolAdv, transp_time, mpi_time, timeOther, full_loop_time, output_time, setup_time, diagnostic_time):
+            print(f"{t:16.10e} ", end="", file=timing_file)
+        print(file=timing_file)
+
+    if rank == 0:
+        print("Timings")
+        print("-------")
+        print("QN | Flux | vPar | Pol | MPI | transp | Other | Total")
+        for t in (timeQN, timeFluxAdv, timevParAdv, timePolAdv, mpi_time, transp_time, timeOther):
+            print(round(t / full_loop_time, 3), end=' | ')
+        print(1)
+        print()
+        for t in (timeQN, timeFluxAdv, timevParAdv, timePolAdv, timeOther):
+            print(t, end=' | ')
+        print(full_loop_time)
 
 
 if __name__ == "__main__":
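Note: every instrumented call above follows the same time_start/accumulate pair. For reference, the same pattern as a context manager — timer and timings are illustrative names, not part of the patch:

    import time
    from contextlib import contextmanager

    timings = {'QN': 0.0, 'FluxAdv': 0.0, 'vParAdv': 0.0, 'PolAdv': 0.0}

    @contextmanager
    def timer(key):
        # Add the wall-clock time spent inside the with-block to timings[key]
        start = time.time()
        try:
            yield
        finally:
            timings[key] += time.time() - start

    # usage:
    with timer('QN'):
        sum(range(1000))   # stand-in for QNSolver work

The explicit in-line form used in the patch avoids context-manager overhead inside the hot loop, at the cost of some repetition.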
diff --git a/pygyro/model/grid.py b/pygyro/model/grid.py
index 5c274a3c..e29c3f15 100644
--- a/pygyro/model/grid.py
+++ b/pygyro/model/grid.py
@@ -94,6 +94,16 @@ def getGlobalIndices(self, *indices: 'ints'):
             result[self._layout.dims_order[i]] = indices[i]+toAdd
         return result
 
+    def get3DSlice(self, *slices: 'ints'):
+        """ get the 3D slice at the provided list of coordinates
+        """
+        assert len(slices) == self._nDims-3
+        slices = slices + (slice(self._nGlobalCoords[self._layout.dims_order[-3]]),
+                           slice(
+                               self._nGlobalCoords[self._layout.dims_order[-2]]),
+                           slice(self._nGlobalCoords[self._layout.dims_order[-1]]))
+        return self._f[tuple(slices)]
+
     def get2DSlice(self, *slices: 'ints'):
         """ get the 2D slice at the provided list of coordinates
         """
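Note: get3DSlice mirrors the existing get2DSlice — the caller fixes the first nDims - 3 layout-ordered indices and receives a view over the last three dimensions. The same indexing in plain numpy, with made-up shapes:

    import numpy as np

    f = np.empty((8, 16, 32, 64))                # 4D data in layout order
    fixed = (3,)                                 # len(fixed) == f.ndim - 3
    slices = fixed + (slice(16), slice(32), slice(64))
    view = f[tuple(slices)]                      # shape (16, 32, 64)
    assert view.base is f                        # a view, not a copy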
diff --git a/pygyro/model/layout.py b/pygyro/model/layout.py
index 738299d6..e43d6013 100644
--- a/pygyro/model/layout.py
+++ b/pygyro/model/layout.py
@@ -2,10 +2,19 @@
 import numpy as np
 import warnings
 import operator
+import time
+import hptt
 
 from abc import ABC
 
 
+def my_transpose(dest, source, axes):
+    if axes == list(range(len(axes))):
+        dest[:] = source
+    else:
+        hptt.tensorTransposeAndUpdate(axes, 1.0, source, 0.0, dest)
+
+
 class Layout:
     """
     Layout: Class containing information about how to access data
@@ -427,6 +436,8 @@ def __init__(self, comms: list, coords: list, layouts: dict, nprocs: list, eta_g
             self._shapes.append((name, new_layout.shape))
         self._layouts = dict(layoutObjects)
         self.nLayouts = len(self._layouts)
+        self._mpi_time = 0
+        self._transpose_time = 0
 
         # Initialise the buffer size before the loop
         self._buffer_size = layoutObjects[0][1].size
@@ -513,6 +524,7 @@ def transpose(self, source, dest, source_name, dest_name, buf=None):
 
         source and dest are assumed to not overlap in memory
         """
+        start_time = time.time()
         # If this thread is only here for plotting purposes then ignore the command
         if (self._buffer_size == 0):
             return
@@ -549,6 +561,7 @@ def transpose(self, source, dest, source_name, dest_name, buf=None):
         else:
             self._transposeRedirect_source_intact(
                 source, dest, buf, source_name, dest_name)
+        self._transpose_time += (time.time() - start_time)
 
     def _transposeRedirect(self, source, dest, source_name, dest_name):
         """
@@ -644,7 +657,8 @@ def _transpose(self, source, dest, layout_source, layout_dest):
         transposition = [layout_source.dims_order.index(
             i) for i in layout_dest.dims_order]
 
-        destView[:] = np.transpose(sourceView, transposition)
+        my_transpose(destView, sourceView, transposition)
+        # destView[:] = np.transpose(sourceView, transposition)
 
         return
@@ -662,8 +676,12 @@ def _transpose_source_intact(self, source, dest, buf, layout_source, layout_dest
         TODO
         """
         # get views of the important parts of the data
+        assert source.flags['C_CONTIGUOUS'] or source.flags['F_CONTIGUOUS']
+        assert np.split(source, [layout_source.size])[0].flags['C_CONTIGUOUS'] or np.split(
+            source, [layout_source.size])[0].flags['F_CONTIGUOUS']
         sourceView = np.split(source, [layout_source.size])[
             0].reshape(layout_source.shape)
+        assert sourceView.flags['C_CONTIGUOUS'] or sourceView.flags['F_CONTIGUOUS']
 
         # get axis information
         axis = self._get_swap_axes(layout_source, layout_dest)
@@ -674,7 +692,8 @@ def _transpose_source_intact(self, source, dest, buf, layout_source, layout_dest
                 0].reshape(layout_dest.shape)
             transposition = [layout_source.dims_order.index(
                 i) for i in layout_dest.dims_order]
-            dest[:] = np.transpose(sourceView, transposition)
+            my_transpose(dest, sourceView, transposition)
+            # dest[:] = np.transpose(sourceView, transposition)
             return
 
         # carry out transpose
@@ -756,12 +775,15 @@ def _extract_from_source(source, tobuffer, layout_source: Layout,
             # the size of the block
             # The data should however be written directly in the buffer
             # as the shapes agree
-            arrView[:] = source[tuple(source_range)].transpose(order)
+            dest_block = np.empty(arrView.shape, dtype=arrView.dtype)
+            my_transpose(dest_block, np.ascontiguousarray(
+                source[tuple(source_range)]), order)
+            arrView[:] = dest_block
+            # arrView[:] = source[tuple(source_range)].transpose(order)
 
             start += size
 
-    @staticmethod
-    def _rearrange_from_buffer(data, buf, layout_source: Layout,
+    def _rearrange_from_buffer(self, data, buf, layout_source: Layout,
                                layout_dest: Layout, axis: list, comm: MPI.Comm):
         """
         Swap the axes of the blocks
@@ -785,7 +807,9 @@ def _rearrange_from_buffer(data, buf, layout_source: Layout,
         rcvBuf = np.split(buf, [size], axis=0)[0]
         assert rcvBuf.base is buf
 
+        start_time = time.time()
         comm.Alltoall(sendBuf, rcvBuf)
+        self._mpi_time += time.time() - start_time
 
         source_order = list(layout_source.dims_order)
@@ -810,7 +834,8 @@
         if (layout_dest.shape[axis[2]] % mpi_size == 0 and layout_source.shape[axis[1]] % mpi_size == 0):
             # If all blocks are the same shape with no padding then the
             # transposition can be carried out directly
-            destView[:] = np.transpose(bufView, transposition)
+            # destView[:] = np.transpose(bufView, transposition)
+            my_transpose(destView, bufView, transposition)
 
         else:
             for r in range(mpi_size):
@@ -831,8 +856,13 @@
 
                 # Transpose the data. As the axes to be concatenated are the first dimension
                 # the concatenation is done automatically
-                destView[tuple(destRanges)] = np.transpose(
-                    bufView[tuple(bufRanges)], transposition)
+                # destView[tuple(destRanges)] = np.transpose(
+                #     bufView[tuple(bufRanges)], transposition)
+                dest_block = np.empty(
+                    destView[tuple(destRanges)].shape, dtype=destView.dtype)
+                my_transpose(dest_block, np.ascontiguousarray(
+                    bufView[tuple(bufRanges)]), transposition)
+                destView[tuple(destRanges)] = dest_block
 
     def compatible(self, l1: Layout, l2: Layout):
         """
@@ -932,6 +962,7 @@ def __init__(self, comm: MPI.Comm, layouts: list, nprocs: list, eta_grids: list,
             enumerate(self._nDims), key=operator.itemgetter(1))
 
         self._totProcs = np.prod(nprocs[self._largestLayoutManager])
+        self._transpose_time = 0
 
         # Get a list of the distribution pattern for each layout type
         self._nprocs = []
@@ -1240,6 +1271,7 @@ def transpose(self, source, dest, source_name, dest_name, buf=None):
 
         source and dest are assumed to not overlap in memory
         """
+        start_time = time.time()
         # If this thread is only here for plotting purposes then ignore the command
         if (self._buffer_size == 0):
             return
@@ -1276,6 +1308,7 @@
             self._transposeRedirect_source_intact(
                 source, dest, buf, source_name, dest_name)
         self._current_manager = self._managers[self._handlers[dest_name]]
+        self._transpose_time += (time.time() - start_time)
 
     def _transpose(self, source, dest, layout_source: Layout, layout_dest: Layout):
         """
@@ -1300,7 +1333,8 @@ def _transpose(self, source, dest, layout_source: Layout, layout_dest: Layout):
                 i) for i in layout_dest.dims_order]
 
             # Copy the relevant information
-            destView[:] = np.transpose(sourceView, transposition)
+            # destView[:] = np.transpose(sourceView, transposition)
+            my_transpose(destView, sourceView, transposition)
 
         elif (dest_ndims > source_ndims):
             # If the source has fewer dimensions then the layout was not
@@ -1329,8 +1363,10 @@ def _transpose(self, source, dest, layout_source: Layout, layout_dest: Layout):
                 i) for i in layout_dest.dims_order]
 
             # Copy the relevant information
-            destView[:] = np.transpose(
-                sourceView[tuple(sourceSlice)], transposition)
+            # destView[:] = np.transpose(
+            #     sourceView[tuple(sourceSlice)], transposition)
+            my_transpose(destView, np.ascontiguousarray(
+                sourceView[tuple(sourceSlice)]), transposition)
 
         else:
             # Find the axis which will be distributed
@@ -1379,7 +1415,11 @@ def _transpose(self, source, dest, layout_source: Layout, layout_dest: Layout):
                 block = np.split(b, [blockSize])[0].reshape(blockShape)
 
                 # Copy the block into the correct part of the memory
-                destView[tuple(slices)] = np.transpose(block, transposition)
+                # destView[tuple(slices)] = np.transpose(block, transposition)
+                dest_block = np.empty(
+                    destView[tuple(slices)].shape, dtype=destView.dtype)
+                my_transpose(dest_block, block, transposition)
+                destView[tuple(slices)] = dest_block
 
         # The data now resides on the wrong memory chunk and must be copied
         dest[:] = source[:]
@@ -1407,7 +1447,8 @@ def _transpose_source_intact(self, source, dest, buf, layout_source: Layout, lay
                 i) for i in layout_dest.dims_order]
 
             # Copy the relevant information
-            destView[:] = np.transpose(sourceView, transposition)
+            # destView[:] = np.transpose(sourceView, transposition)
+            my_transpose(destView, sourceView, transposition)
 
         elif (dest_ndims > source_ndims):
             # If the source has fewer dimensions then the layout was not
@@ -1436,8 +1477,10 @@ def _transpose_source_intact(self, source, dest, buf, layout_source: Layout, lay
                 i) for i in layout_dest.dims_order]
 
             # Copy the relevant information
-            destView[:] = np.transpose(
-                sourceView[tuple(sourceSlice)], transposition)
+            # destView[:] = np.transpose(
+            #     sourceView[tuple(sourceSlice)], transposition)
+            my_transpose(destView, np.ascontiguousarray(
+                sourceView[tuple(sourceSlice)]), transposition)
 
         else:
             # Find the axis which will be distributed
             idx_d, idx_s = self.getAxes(layout_dest, layout_source)
@@ -1485,7 +1528,11 @@ def _transpose_source_intact(self, source, dest, buf, layout_source: Layout, lay
                 block = np.split(b, [blockSize])[0].reshape(blockShape)
 
                 # Copy the block into the correct part of the memory
-                destView[tuple(slices)] = np.transpose(block, transposition)
+                # destView[tuple(slices)] = np.transpose(block, transposition)
+                dest_block = np.empty(
+                    destView[tuple(slices)].shape, dtype=destView.dtype)
+                my_transpose(dest_block, block, transposition)
+                destView[tuple(slices)] = dest_block
 
     def _transposeRedirect(self, source, dest, source_name, dest_name):
         """
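Note: judging from the call hptt.tensorTransposeAndUpdate(axes, 1.0, source, 0.0, dest), the routine computes dest = alpha * transpose(source, axes) + beta * dest, so with alpha=1 and beta=0 it is a drop-in replacement for the np.transpose assignments it shadows. What my_transpose is expected to compute, as a numpy-only sketch:

    import numpy as np

    def reference_transpose(dest, source, axes):
        # Reference behaviour for my_transpose (plain numpy, no hptt)
        if list(axes) == list(range(len(axes))):
            dest[:] = source                      # identity permutation: plain copy
        else:
            dest[:] = np.transpose(source, axes)  # materialised, not a strided view

Unlike numpy's lazy strided transpose, hptt materialises the permutation and needs contiguous operands — hence the new contiguity asserts, the np.ascontiguousarray calls on sliced sources, and the intermediate dest_block buffers that are copied into non-contiguous destination views afterwards.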
diff --git a/pygyro/model/test_layout.py b/pygyro/model/test_layout.py
index 1046adc0..835c5b8a 100644
--- a/pygyro/model/test_layout.py
+++ b/pygyro/model/test_layout.py
@@ -126,11 +126,11 @@ def test_OddLayoutPaths():
     assert layout1.name == '0123'
     layout2 = remapper.getLayout('1230')
 
-    fStart = np.empty(remapper.bufferSize, int)
+    fStart = np.empty(remapper.bufferSize, complex)
     fStart[:] = -1
-    fEnd = np.empty(remapper.bufferSize, int)
+    fEnd = np.empty(remapper.bufferSize, complex)
     fEnd[:] = -1
-    fBuf = np.empty(remapper.bufferSize, int)
+    fBuf = np.empty(remapper.bufferSize, complex)
     fBuf[:] = -1
     f1_s = np.split(fStart, [layout1.size])[0].reshape(layout1.shape)
     f2_s = np.split(fStart, [layout2.size])[0].reshape(layout2.shape)
diff --git a/pygyro/tools/getTimings.py b/pygyro/tools/getTimings.py
index 954fabed..c420352a 100644
--- a/pygyro/tools/getTimings.py
+++ b/pygyro/tools/getTimings.py
@@ -8,24 +8,30 @@
 args = parser.parse_args()
 nprocs = args.nprocs[0]
 
+steps = np.empty((nprocs, 7))
 loop = np.empty(nprocs)
 output = np.empty(nprocs)
 setup = np.empty(nprocs)
 additional = np.empty(nprocs)
 
 for i in range(nprocs):
-    file_object = open("timing/{}_l2Test{}.txt".format(nprocs, i), "r")
-    loop[i] = float(file_object.read(19))
-    output[i] = float(file_object.read(19))
-    setup[i] = float(file_object.read(19))
-    additional[i] = float(file_object.read(16))
-    file_object.close()
+    with open(f"timing/{nprocs}_l2Test{i}.txt", "r") as file_object:
+        for j in range(7):
+            steps[i, j] = float(file_object.read(19))
+        loop[i] = float(file_object.read(19))
+        output[i] = float(file_object.read(19))
+        setup[i] = float(file_object.read(19))
+        additional[i] = float(file_object.read(16))
 
 loop_time = max(loop)
 output_time = max(output)
 setup_time = max(setup)
 additional_time = max(additional)
 
-file_object = open("results.txt", "a")
-file_object.write("{nprocs:8} {loop:16.10e} {output:16.10e} {setup:16.10e} {additional:16.10e}\n"
-                  .format(nprocs=nprocs, loop=loop_time, output=output_time, setup=setup_time, additional=additional_time))
+with open("results.txt", "a") as file_object:
+    file_object.write(f"{nprocs:8} ")
+    for j in range(7):
+        file_object.write(f"{max(steps[:, j]):8} ")
+    file_object.write(
+        f"{loop_time:16.10e} {output_time:16.10e} {setup_time:16.10e} {additional_time:16.10e}\n")
+    # file_object.write(f"{nprocs:8} {loop_time:16.10e} {output_time:16.10e} {setup_time:16.10e} {additional_time:16.10e}\n")
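Note: each rank's timing file now carries eleven fields on a single line — the seven per-step timers in write order (QN, Flux, vPar, Pol, transp, MPI, Other) followed by the loop, output, setup and diagnostic totals — which is what the fixed-width read(19) calls step through. A more width-tolerant reader, sketched with illustrative values:

    nprocs, rank = 4, 0                          # illustrative
    with open(f"timing/{nprocs}_l2Test{rank}.txt") as fh:
        values = [float(v) for v in fh.read().split()]
    steps = values[:7]                           # QN, Flux, vPar, Pol, transp, MPI, Other
    loop, output, setup, diagnostic = values[7:11]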
diff --git a/requirements.txt b/requirements.txt
index b0dbd305..0575a8c7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,6 +5,7 @@
 pytest>=2.8.7
 mpi4py>=3.0.0
 h5py>=2.8.0
 git+https://github.com/pyccel/pyccel.git@supercomputing
+#git+https://github.com/EmilyBourne/hptt.git@master
 #pyccel>=2.0.0
 # h5py must be built from source using MPI compiler
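Note: the trailing h5py comment pairs with the CC="mpicc" HDF5_MPI="ON" export in the CI action above; whether the installed build actually has MPI support can be checked with:

    import h5py
    # True only if h5py was compiled against an MPI-enabled HDF5
    assert h5py.get_config().mpi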