diff --git a/.github/actions/pip_installation/action.yml b/.github/actions/pip_installation/action.yml
index 7e793b7c..a059cad1 100644
--- a/.github/actions/pip_installation/action.yml
+++ b/.github/actions/pip_installation/action.yml
@@ -14,8 +14,18 @@ runs:
     - name: Install dependencies
       shell: bash
      run: |
-        export CC="mpicc" HDF5_MPI="ON"
-        python -m pip install -r requirements.txt
-        python -m pip install numba
-        python -m pip install pythran
-
+        export CC="mpicc" HDF5_MPI="ON"
+        python -m pip install -r requirements.txt
+        python -m pip install numba
+        python -m pip install pythran
+    - uses: actions/checkout@v5
+      with:
+        repository: EmilyBourne/hptt
+        path: hptt
+    - name: Install hptt
+      shell: bash
+      run: |
+        cd hptt
+        make -j
+        cd pythonAPI
+        python -m pip install .
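Note: hptt is built from source in CI (checkout, make, then pip install of its pythonAPI) rather than pip-installed; compare the commented-out line in requirements.txt at the end of this patch. A quick smoke test of the resulting module — a sketch, not part of the patch, relying only on the tensorTransposeAndUpdate call that pygyro/model/layout.py uses further down:

    import numpy as np
    import hptt

    src = np.arange(24.0).reshape(2, 3, 4)   # C-contiguous test tensor
    perm = [2, 0, 1]
    dest = np.empty((4, 2, 3))
    # Semantics inferred from the patch: dest = 1.0 * transpose(src, perm) + 0.0 * dest
    hptt.tensorTransposeAndUpdate(perm, 1.0, src, 0.0, dest)
    assert np.array_equal(dest, np.transpose(src, perm))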
diff --git a/fullSimulation.py b/fullSimulation.py
index 8e9989c6..97024b65 100644
--- a/fullSimulation.py
+++ b/fullSimulation.py
@@ -218,6 +218,12 @@ def my_print(rank, *args, **kwargs):
     average_output = 0
     startPrint = max(0, ti % saveStep)
     timeForLoop = True
+
+    timeQN = 0
+    timeFluxAdv = 0
+    timevParAdv = 0
+    timePolAdv = 0
+
     while (ti < tN and timeForLoop):
         full_loop_start = time.time()
 
@@ -228,49 +234,77 @@ def my_print(rank, *args, **kwargs):
         # Compute f^n+1/2 using Lie splitting
         distribFunc.setLayout('flux_surface')
         distribFunc.saveGridValues()
+        time_start = time.time()
         fluxAdv.gridStep(distribFunc)
+        timeFluxAdv += (time.time()-time_start)
         distribFunc.setLayout('v_parallel')
         phi.setLayout('v_parallel_1d')
+        time_start = time.time()
         vParAdv.gridStep(distribFunc, phi, parGrad, parGradVals, halfStep)
+        timevParAdv += (time.time()-time_start)
         distribFunc.setLayout('poloidal')
         phi.setLayout('poloidal')
+        time_start = time.time()
         polAdv.gridStep(distribFunc, phi, halfStep)
+        timePolAdv += (time.time()-time_start)
 
         # Find phi from f^n+1/2 by solving QN eq again
         distribFunc.setLayout('v_parallel')
+        time_start = time.time()
         density.getPerturbedRho(distribFunc, rho)
+        timeQN += (time.time()-time_start)
         QNSolver.getModes(rho)
         rho.setLayout('mode_solve')
         phi.setLayout('mode_solve')
+        time_start = time.time()
         QNSolver.solveEquation(phi, rho)
+        timeQN += (time.time()-time_start)
         phi.setLayout('v_parallel_2d')
         rho.setLayout('v_parallel_2d')
+        time_start = time.time()
         QNSolver.findPotential(phi)
+        timeQN += (time.time()-time_start)
 
         # Compute f^n+1 using strang splitting
         distribFunc.restoreGridValues()  # restored from flux_surface layout
+        time_start = time.time()
         fluxAdv.gridStep(distribFunc)
+        timeFluxAdv += (time.time()-time_start)
         distribFunc.setLayout('v_parallel')
         phi.setLayout('v_parallel_1d')
+        time_start = time.time()
         vParAdv.gridStep(distribFunc, phi, parGrad, parGradVals, halfStep)
+        timevParAdv += (time.time()-time_start)
         distribFunc.setLayout('poloidal')
         phi.setLayout('poloidal')
+        time_start = time.time()
         polAdv.gridStep(distribFunc, phi, fullStep)
+        timePolAdv += (time.time()-time_start)
         distribFunc.setLayout('v_parallel')
+        time_start = time.time()
         vParAdv.gridStepKeepGradient(distribFunc, parGradVals, halfStep)
+        timevParAdv += (time.time()-time_start)
         distribFunc.setLayout('flux_surface')
+        time_start = time.time()
         fluxAdv.gridStep(distribFunc)
+        timeFluxAdv += (time.time()-time_start)
 
         # Find phi from f^n by solving QN eq
         distribFunc.setLayout('v_parallel')
+        time_start = time.time()
         density.getPerturbedRho(distribFunc, rho)
+        timeQN += (time.time()-time_start)
         QNSolver.getModes(rho)
         rho.setLayout('mode_solve')
         phi.setLayout('mode_solve')
+        time_start = time.time()
         QNSolver.solveEquation(phi, rho)
+        timeQN += (time.time()-time_start)
         phi.setLayout('v_parallel_2d')
         rho.setLayout('v_parallel_2d')
+        time_start = time.time()
         QNSolver.findPotential(phi)
+        timeQN += (time.time()-time_start)
 
         diagnostic_start = time.time()
         # Calculate diagnostic quantities
@@ -301,8 +335,6 @@ def my_print(rank, *args, **kwargs):
         timeForLoop = comm.allreduce((time.time(
         ) - setup_time_start + 2*average_loop + 2*average_output) < stopTime, op=MPI.LAND)
 
-        full_loop_time += (time.time()-full_loop_start)
-
         output_start = time.time()
         if (ti % saveStep != 0):
@@ -327,15 +359,31 @@ def my_print(rank, *args, **kwargs):
         print(s.getvalue(), file=open("profile/l2Test{}.txt".format(rank), "w"))
 
     if (rank == 0):
-        if (not os.path.isdir("timing")):
-            os.mkdir("timing")
+        if (not os.path.isdir(f"{foldername}/timing")):
+            os.mkdir(f"{foldername}/timing")
     MPI.COMM_WORLD.Barrier()
 
-    print("{loop:16.10e} {output:16.10e} {setup:16.10e} {diagnostic:16.10e}".
-          format(loop=full_loop_time, output=output_time, setup=setup_time,
-                 diagnostic=diagnostic_time),
-          file=open("timing/{}_l2Test{}.txt".format(MPI.COMM_WORLD.Get_size(), rank), "w"))
+    mpi_time = distribFunc._layout_manager._mpi_time
+    transp_time = distribFunc._layout_manager._transpose_time - mpi_time
+
+    timeOther = full_loop_time - sum((timeQN, timeFluxAdv, timevParAdv, timePolAdv, transp_time, mpi_time))
+
+    with open(f"{foldername}/timing/{MPI.COMM_WORLD.Get_size()}_l2Test{rank}.txt", "w") as timing_file:
+        for t in (timeQN, timeFluxAdv, timevParAdv, timePolAdv, transp_time, mpi_time, timeOther, full_loop_time, output_time, setup_time, diagnostic_time):
+            print(f"{t:16.10e} ", end="", file=timing_file)
+        print(file=timing_file)
+
+    if rank == 0:
+        print("Timings")
+        print("-------")
+        print("QN | Flux | vPar | Pol | MPI | transp | Other | Total")
+        for t in (timeQN, timeFluxAdv, timevParAdv, timePolAdv, mpi_time, transp_time, timeOther):
+            print(round(t / full_loop_time, 3), end=' | ')
+        print(1)
+        print()
+        for t in (timeQN, timeFluxAdv, timevParAdv, timePolAdv, timeOther):
+            print(t, end=' | ')
+        print(full_loop_time)
 
 
 if __name__ == "__main__":
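Note: every instrumented call above follows the same time_start/accumulate pair. For reference, the same pattern as a context manager — timer and timings are illustrative names, not part of the patch:

    import time
    from contextlib import contextmanager

    timings = {'QN': 0.0, 'FluxAdv': 0.0, 'vParAdv': 0.0, 'PolAdv': 0.0}

    @contextmanager
    def timer(key):
        # Add the wall-clock time spent inside the with-block to timings[key]
        start = time.time()
        try:
            yield
        finally:
            timings[key] += time.time() - start

    # usage:
    with timer('QN'):
        sum(range(1000))   # stand-in for QNSolver work

The explicit in-line form used in the patch avoids context-manager overhead inside the hot loop, at the cost of some repetition.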
diff --git a/pygyro/model/grid.py b/pygyro/model/grid.py
index 5c274a3c..e29c3f15 100644
--- a/pygyro/model/grid.py
+++ b/pygyro/model/grid.py
@@ -94,6 +94,16 @@ def getGlobalIndices(self, *indices: 'ints'):
             result[self._layout.dims_order[i]] = indices[i]+toAdd
         return result
 
+    def get3DSlice(self, *slices: 'ints'):
+        """ get the 3D slice at the provided list of coordinates
+        """
+        assert len(slices) == self._nDims-3
+        slices = slices + (slice(self._nGlobalCoords[self._layout.dims_order[-3]]),
+                           slice(
+                               self._nGlobalCoords[self._layout.dims_order[-2]]),
+                           slice(self._nGlobalCoords[self._layout.dims_order[-1]]))
+        return self._f[tuple(slices)]
+
     def get2DSlice(self, *slices: 'ints'):
         """ get the 2D slice at the provided list of coordinates
         """
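Note: get3DSlice mirrors the existing get2DSlice — the caller fixes the first nDims - 3 layout-ordered indices and receives a view over the last three dimensions. The same indexing in plain numpy, with made-up shapes:

    import numpy as np

    f = np.empty((8, 16, 32, 64))                # 4D data in layout order
    fixed = (3,)                                 # len(fixed) == f.ndim - 3
    slices = fixed + (slice(16), slice(32), slice(64))
    view = f[tuple(slices)]                      # shape (16, 32, 64)
    assert view.base is f                        # a view, not a copy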
diff --git a/pygyro/model/layout.py b/pygyro/model/layout.py
index 738299d6..e43d6013 100644
--- a/pygyro/model/layout.py
+++ b/pygyro/model/layout.py
@@ -2,10 +2,19 @@
 import numpy as np
 import warnings
 import operator
+import time
+import hptt
 
 from abc import ABC
 
 
+def my_transpose(dest, source, axes):
+    if axes == list(range(len(axes))):
+        dest[:] = source
+    else:
+        hptt.tensorTransposeAndUpdate(axes, 1.0, source, 0.0, dest)
+
+
 class Layout:
     """
     Layout: Class containing information about how to access data
@@ -427,6 +436,8 @@ def __init__(self, comms: list, coords: list, layouts: dict, nprocs: list, eta_g
             self._shapes.append((name, new_layout.shape))
         self._layouts = dict(layoutObjects)
         self.nLayouts = len(self._layouts)
+        self._mpi_time = 0
+        self._transpose_time = 0
 
         # Initialise the buffer size before the loop
         self._buffer_size = layoutObjects[0][1].size
@@ -513,6 +524,7 @@ def transpose(self, source, dest, source_name, dest_name, buf=None):
 
         source and dest are assumed to not overlap in memory
         """
+        start_time = time.time()
         # If this thread is only here for plotting purposes then ignore the command
         if (self._buffer_size == 0):
             return
@@ -549,6 +561,7 @@ def transpose(self, source, dest, source_name, dest_name, buf=None):
         else:
             self._transposeRedirect_source_intact(
                 source, dest, buf, source_name, dest_name)
+        self._transpose_time += (time.time() - start_time)
 
     def _transposeRedirect(self, source, dest, source_name, dest_name):
         """
@@ -644,7 +657,8 @@ def _transpose(self, source, dest, layout_source, layout_dest):
         transposition = [layout_source.dims_order.index(
             i) for i in layout_dest.dims_order]
 
-        destView[:] = np.transpose(sourceView, transposition)
+        my_transpose(destView, sourceView, transposition)
+        # destView[:] = np.transpose(sourceView, transposition)
 
         return
@@ -662,8 +676,12 @@ def _transpose_source_intact(self, source, dest, buf, layout_source, layout_dest
         TODO
         """
         # get views of the important parts of the data
+        assert source.flags['C_CONTIGUOUS'] or source.flags['F_CONTIGUOUS']
+        assert np.split(source, [layout_source.size])[0].flags['C_CONTIGUOUS'] or np.split(
+            source, [layout_source.size])[0].flags['F_CONTIGUOUS']
         sourceView = np.split(source, [layout_source.size])[
             0].reshape(layout_source.shape)
+        assert sourceView.flags['C_CONTIGUOUS'] or sourceView.flags['F_CONTIGUOUS']
 
         # get axis information
         axis = self._get_swap_axes(layout_source, layout_dest)
@@ -674,7 +692,8 @@ def _transpose_source_intact(self, source, dest, buf, layout_source, layout_dest
                 0].reshape(layout_dest.shape)
             transposition = [layout_source.dims_order.index(
                 i) for i in layout_dest.dims_order]
-            dest[:] = np.transpose(sourceView, transposition)
+            my_transpose(dest, sourceView, transposition)
+            # dest[:] = np.transpose(sourceView, transposition)
             return
 
         # carry out transpose
@@ -756,12 +775,15 @@ def _extract_from_source(source, tobuffer, layout_source: Layout,
             # the size of the block
             # The data should however be written directly in the buffer
             # as the shapes agree
-            arrView[:] = source[tuple(source_range)].transpose(order)
+            dest_block = np.empty(arrView.shape, dtype=arrView.dtype)
+            my_transpose(dest_block, np.ascontiguousarray(
+                source[tuple(source_range)]), order)
+            arrView[:] = dest_block
+            # arrView[:] = source[tuple(source_range)].transpose(order)
 
             start += size
 
-    @staticmethod
-    def _rearrange_from_buffer(data, buf, layout_source: Layout,
+    def _rearrange_from_buffer(self, data, buf, layout_source: Layout,
                                layout_dest: Layout, axis: list, comm: MPI.Comm):
         """
         Swap the axes of the blocks
@@ -785,7 +807,9 @@ def _rearrange_from_buffer(data, buf, layout_source: Layout,
         rcvBuf = np.split(buf, [size], axis=0)[0]
         assert rcvBuf.base is buf
 
+        start_time = time.time()
         comm.Alltoall(sendBuf, rcvBuf)
+        self._mpi_time += time.time() - start_time
 
         source_order = list(layout_source.dims_order)
@@ -810,7 +834,8 @@
         if (layout_dest.shape[axis[2]] % mpi_size == 0 and layout_source.shape[axis[1]] % mpi_size == 0):
             # If all blocks are the same shape with no padding then the
             # transposition can be carried out directly
-            destView[:] = np.transpose(bufView, transposition)
+            # destView[:] = np.transpose(bufView, transposition)
+            my_transpose(destView, bufView, transposition)
 
         else:
             for r in range(mpi_size):
@@ -831,8 +856,13 @@
 
                 # Transpose the data. As the axes to be concatenated are the first dimension
                 # the concatenation is done automatically
-                destView[tuple(destRanges)] = np.transpose(
-                    bufView[tuple(bufRanges)], transposition)
+                # destView[tuple(destRanges)] = np.transpose(
+                #     bufView[tuple(bufRanges)], transposition)
+                dest_block = np.empty(
+                    destView[tuple(destRanges)].shape, dtype=destView.dtype)
+                my_transpose(dest_block, np.ascontiguousarray(
+                    bufView[tuple(bufRanges)]), transposition)
+                destView[tuple(destRanges)] = dest_block
 
     def compatible(self, l1: Layout, l2: Layout):
         """
@@ -932,6 +962,7 @@ def __init__(self, comm: MPI.Comm, layouts: list, nprocs: list, eta_grids: list,
             enumerate(self._nDims), key=operator.itemgetter(1))
 
         self._totProcs = np.prod(nprocs[self._largestLayoutManager])
+        self._transpose_time = 0
 
         # Get a list of the distribution pattern for each layout type
         self._nprocs = []
@@ -1240,6 +1271,7 @@ def transpose(self, source, dest, source_name, dest_name, buf=None):
 
         source and dest are assumed to not overlap in memory
         """
+        start_time = time.time()
         # If this thread is only here for plotting purposes then ignore the command
         if (self._buffer_size == 0):
             return
@@ -1276,6 +1308,7 @@
             self._transposeRedirect_source_intact(
                 source, dest, buf, source_name, dest_name)
         self._current_manager = self._managers[self._handlers[dest_name]]
+        self._transpose_time += (time.time() - start_time)
 
     def _transpose(self, source, dest, layout_source: Layout, layout_dest: Layout):
         """
@@ -1300,7 +1333,8 @@ def _transpose(self, source, dest, layout_source: Layout, layout_dest: Layout):
                 i) for i in layout_dest.dims_order]
 
             # Copy the relevant information
-            destView[:] = np.transpose(sourceView, transposition)
+            # destView[:] = np.transpose(sourceView, transposition)
+            my_transpose(destView, sourceView, transposition)
 
         elif (dest_ndims > source_ndims):
             # If the source has fewer dimensions then the layout was not
@@ -1329,8 +1363,10 @@ def _transpose(self, source, dest, layout_source: Layout, layout_dest: Layout):
                 i) for i in layout_dest.dims_order]
 
             # Copy the relevant information
-            destView[:] = np.transpose(
-                sourceView[tuple(sourceSlice)], transposition)
+            # destView[:] = np.transpose(
+            #     sourceView[tuple(sourceSlice)], transposition)
+            my_transpose(destView, np.ascontiguousarray(
+                sourceView[tuple(sourceSlice)]), transposition)
 
         else:
             # Find the axis which will be distributed
@@ -1379,7 +1415,11 @@ def _transpose(self, source, dest, layout_source: Layout, layout_dest: Layout):
                 block = np.split(b, [blockSize])[0].reshape(blockShape)
 
                 # Copy the block into the correct part of the memory
-                destView[tuple(slices)] = np.transpose(block, transposition)
+                # destView[tuple(slices)] = np.transpose(block, transposition)
+                dest_block = np.empty(
+                    destView[tuple(slices)].shape, dtype=destView.dtype)
+                my_transpose(dest_block, block, transposition)
+                destView[tuple(slices)] = dest_block
 
         # The data now resides on the wrong memory chunk and must be copied
         dest[:] = source[:]
@@ -1407,7 +1447,8 @@ def _transpose_source_intact(self, source, dest, buf, layout_source: Layout, lay
                 i) for i in layout_dest.dims_order]
 
             # Copy the relevant information
-            destView[:] = np.transpose(sourceView, transposition)
+            # destView[:] = np.transpose(sourceView, transposition)
+            my_transpose(destView, sourceView, transposition)
 
         elif (dest_ndims > source_ndims):
             # If the source has fewer dimensions then the layout was not
@@ -1436,8 +1477,10 @@ def _transpose_source_intact(self, source, dest, buf, layout_source: Layout, lay
                 i) for i in layout_dest.dims_order]
 
             # Copy the relevant information
-            destView[:] = np.transpose(
-                sourceView[tuple(sourceSlice)], transposition)
+            # destView[:] = np.transpose(
+            #     sourceView[tuple(sourceSlice)], transposition)
+            my_transpose(destView, np.ascontiguousarray(
+                sourceView[tuple(sourceSlice)]), transposition)
 
         else:
             # Find the axis which will be distributed
             idx_d, idx_s = self.getAxes(layout_dest, layout_source)
@@ -1485,7 +1528,11 @@ def _transpose_source_intact(self, source, dest, buf, layout_source: Layout, lay
                 block = np.split(b, [blockSize])[0].reshape(blockShape)
 
                 # Copy the block into the correct part of the memory
-                destView[tuple(slices)] = np.transpose(block, transposition)
+                # destView[tuple(slices)] = np.transpose(block, transposition)
+                dest_block = np.empty(
+                    destView[tuple(slices)].shape, dtype=destView.dtype)
+                my_transpose(dest_block, block, transposition)
+                destView[tuple(slices)] = dest_block
 
     def _transposeRedirect(self, source, dest, source_name, dest_name):
         """
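Note: judging from the call hptt.tensorTransposeAndUpdate(axes, 1.0, source, 0.0, dest), the routine computes dest = alpha * transpose(source, axes) + beta * dest, so with alpha=1 and beta=0 it is a drop-in replacement for the np.transpose assignments it shadows. What my_transpose is expected to compute, as a numpy-only sketch:

    import numpy as np

    def reference_transpose(dest, source, axes):
        # Reference behaviour for my_transpose (plain numpy, no hptt)
        if list(axes) == list(range(len(axes))):
            dest[:] = source                      # identity permutation: plain copy
        else:
            dest[:] = np.transpose(source, axes)  # materialised, not a strided view

Unlike numpy's lazy strided transpose, hptt materialises the permutation and needs contiguous operands — hence the new contiguity asserts, the np.ascontiguousarray calls on sliced sources, and the intermediate dest_block buffers that are copied into non-contiguous destination views afterwards.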
diff --git a/pygyro/model/test_layout.py b/pygyro/model/test_layout.py
index 1046adc0..835c5b8a 100644
--- a/pygyro/model/test_layout.py
+++ b/pygyro/model/test_layout.py
@@ -126,11 +126,11 @@ def test_OddLayoutPaths():
     assert layout1.name == '0123'
     layout2 = remapper.getLayout('1230')
 
-    fStart = np.empty(remapper.bufferSize, int)
+    fStart = np.empty(remapper.bufferSize, complex)
     fStart[:] = -1
-    fEnd = np.empty(remapper.bufferSize, int)
+    fEnd = np.empty(remapper.bufferSize, complex)
     fEnd[:] = -1
-    fBuf = np.empty(remapper.bufferSize, int)
+    fBuf = np.empty(remapper.bufferSize, complex)
     fBuf[:] = -1
     f1_s = np.split(fStart, [layout1.size])[0].reshape(layout1.shape)
     f2_s = np.split(fStart, [layout2.size])[0].reshape(layout2.shape)
diff --git a/pygyro/tools/getTimings.py b/pygyro/tools/getTimings.py
index 954fabed..c420352a 100644
--- a/pygyro/tools/getTimings.py
+++ b/pygyro/tools/getTimings.py
@@ -8,24 +8,30 @@
 args = parser.parse_args()
 nprocs = args.nprocs[0]
 
+steps = np.empty((nprocs, 7))
 loop = np.empty(nprocs)
 output = np.empty(nprocs)
 setup = np.empty(nprocs)
 additional = np.empty(nprocs)
 
 for i in range(nprocs):
-    file_object = open("timing/{}_l2Test{}.txt".format(nprocs, i), "r")
-    loop[i] = float(file_object.read(19))
-    output[i] = float(file_object.read(19))
-    setup[i] = float(file_object.read(19))
-    additional[i] = float(file_object.read(16))
-    file_object.close()
+    with open(f"timing/{nprocs}_l2Test{i}.txt", "r") as file_object:
+        for j in range(7):
+            steps[i, j] = float(file_object.read(19))
+        loop[i] = float(file_object.read(19))
+        output[i] = float(file_object.read(19))
+        setup[i] = float(file_object.read(19))
+        additional[i] = float(file_object.read(16))
 
 loop_time = max(loop)
 output_time = max(output)
 setup_time = max(setup)
 additional_time = max(additional)
 
-file_object = open("results.txt", "a")
-file_object.write("{nprocs:8} {loop:16.10e} {output:16.10e} {setup:16.10e} {additional:16.10e}\n"
-                  .format(nprocs=nprocs, loop=loop_time, output=output_time, setup=setup_time, additional=additional_time))
+with open("results.txt", "a") as file_object:
+    file_object.write(f"{nprocs:8} ")
+    for j in range(7):
+        file_object.write(f"{max(steps[:, j]):8} ")
+    file_object.write(
+        f"{loop_time:16.10e} {output_time:16.10e} {setup_time:16.10e} {additional_time:16.10e}\n")
+    # file_object.write(f"{nprocs:8} {loop_time:16.10e} {output_time:16.10e} {setup_time:16.10e} {additional_time:16.10e}\n")
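Note: each rank's timing file now carries eleven fields on a single line — the seven per-step timers in write order (QN, Flux, vPar, Pol, transp, MPI, Other) followed by the loop, output, setup and diagnostic totals — which is what the fixed-width read(19) calls step through. A more width-tolerant reader, sketched with illustrative values:

    nprocs, rank = 4, 0                          # illustrative
    with open(f"timing/{nprocs}_l2Test{rank}.txt") as fh:
        values = [float(v) for v in fh.read().split()]
    steps = values[:7]                           # QN, Flux, vPar, Pol, transp, MPI, Other
    loop, output, setup, diagnostic = values[7:11]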
diff --git a/requirements.txt b/requirements.txt
index b0dbd305..0575a8c7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,6 +5,7 @@
 pytest>=2.8.7
 mpi4py>=3.0.0
 h5py>=2.8.0
 git+https://github.com/pyccel/pyccel.git@supercomputing
+#git+https://github.com/EmilyBourne/hptt.git@master
 #pyccel>=2.0.0
 # h5py must be built from source using MPI compiler
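Note: the trailing h5py comment pairs with the CC="mpicc" HDF5_MPI="ON" export in the CI action above; whether the installed build actually has MPI support can be checked with:

    import h5py
    # True only if h5py was compiled against an MPI-enabled HDF5
    assert h5py.get_config().mpi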