Skip to content

Commit 5174bae

Browse files
Added a class to check the timings of cuda kernels, started using it in the RapidHeightMapExtractorCUDA (#675)
* Added a class to check the timings of cuda kernels, started using it in the RapidHeightMapExtractorCUDA --------- Co-authored-by: ds58 <[email protected]> Co-authored-by: Dexton Anderson <[email protected]>
1 parent 1160ccd commit 5174bae

File tree

3 files changed

+143
-4
lines changed

3 files changed

+143
-4
lines changed

ihmc-perception/src/main/java/us/ihmc/perception/cuda/CUDAKernel.java

Lines changed: 131 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package us.ihmc.perception.cuda;
22

3+
import org.bytedeco.cuda.cudart.CUevent_st;
34
import org.bytedeco.cuda.cudart.CUfunc_st;
45
import org.bytedeco.cuda.cudart.CUmod_st;
56
import org.bytedeco.cuda.cudart.CUstream_st;
@@ -11,29 +12,48 @@
1112
import org.bytedeco.javacpp.LongPointer;
1213
import org.bytedeco.javacpp.Pointer;
1314
import org.bytedeco.javacpp.PointerPointer;
15+
import us.ihmc.log.LogTools;
1416

1517
import java.util.ArrayList;
18+
import java.util.LinkedList;
1619
import java.util.List;
20+
import java.util.Optional;
1721

18-
import static org.bytedeco.cuda.global.cudart.cuLaunchKernel;
19-
import static org.bytedeco.cuda.global.cudart.cuModuleGetFunction;
22+
import static org.bytedeco.cuda.global.cudart.*;
2023
import static us.ihmc.perception.cuda.CUDATools.throwCUDAError;
2124

2225
@SuppressWarnings("resource")
2326
public class CUDAKernel implements AutoCloseable
2427
{
28+
private final String name;
2529
private final CUfunc_st kernelFunction = new CUfunc_st();
2630
private final List<Pointer> parameters = new ArrayList<>();
2731
private boolean retainParameters = false;
32+
private boolean enableKernelTimings = false;
33+
34+
private CUDAKernelTimings kernelTimings;
35+
private final CUevent_st start = new CUevent_st();
36+
private final CUevent_st end = new CUevent_st();
2837

2938
private int error;
3039

3140
public CUDAKernel(String name, CUmod_st kernelModule) throws Exception
3241
{
42+
this.name = name;
3343
error = cuModuleGetFunction(kernelFunction, kernelModule, name);
3444
throwCUDAError(error);
3545
}
3646

47+
/**
48+
* Setting this to true enables the ability to run timings on the specific kernel.
49+
* The timing checks perform synchronization calls.
50+
*/
51+
public void enableKernelTimings(boolean enableKernelTimings)
52+
{
53+
this.enableKernelTimings = enableKernelTimings;
54+
kernelTimings = new CUDAKernelTimings();
55+
}
56+
3757
public void retainParameters(boolean retainParameters)
3858
{
3959
this.retainParameters = retainParameters;
@@ -53,6 +73,13 @@ public void run(CUstream_st stream, dim3 gridSize, dim3 blockSize, int sharedMem
5373
for (int i = 0; i < parameters.size(); ++i)
5474
parametersPointer.put(i, parameters.get(i));
5575

76+
if (enableKernelTimings)
77+
{
78+
cudaEventCreate(start);
79+
cudaEventCreate(end);
80+
cudaEventRecord(start);
81+
}
82+
5683
error = cuLaunchKernel(kernelFunction,
5784
gridSize.x(),
5885
gridSize.y(),
@@ -64,6 +91,16 @@ public void run(CUstream_st stream, dim3 gridSize, dim3 blockSize, int sharedMem
6491
stream,
6592
parametersPointer,
6693
new PointerPointer<>());
94+
95+
if (enableKernelTimings)
96+
{
97+
cudaEventRecord(end);
98+
cudaEventSynchronize(end);
99+
100+
kernelTimings.addExecutionTime(start, end);
101+
kernelTimings.printTimesForKernel();
102+
}
103+
67104
CUDATools.checkCUDAError(error);
68105

69106
if (!retainParameters)
@@ -123,4 +160,96 @@ public void close()
123160
clearParameters();
124161
kernelFunction.close();
125162
}
163+
164+
/**
165+
* This class handles the kernel timings.
166+
* With options to compute the min/max, average, and variance of the dataset
167+
*/
168+
private class CUDAKernelTimings
169+
{
170+
private static final int MAX_ENTRIES = 250;
171+
private final LinkedList<Float> executionTimes = new LinkedList<>();
172+
173+
private void addExecutionTime(CUevent_st start, CUevent_st end)
174+
{
175+
float[] milliseconds = new float[1];
176+
milliseconds[0] = 0.0f;
177+
cudaEventElapsedTime(milliseconds, start, end);
178+
executionTimes.add(milliseconds[0]);
179+
180+
if (executionTimes.size() > MAX_ENTRIES)
181+
{
182+
executionTimes.pollFirst();
183+
}
184+
}
185+
186+
public double getAverageTime(String kernelName)
187+
{
188+
if (executionTimes.isEmpty())
189+
{
190+
LogTools.info("No recorded times for " + kernelName);
191+
return Float.NaN;
192+
}
193+
else
194+
{
195+
return executionTimes.stream().mapToDouble(Float::doubleValue).average().orElse(0.0);
196+
}
197+
}
198+
199+
public Float getMinTime(String kernelName)
200+
{
201+
if (executionTimes.isEmpty())
202+
{
203+
LogTools.info("No recorded times for " + kernelName);
204+
return Float.NaN;
205+
}
206+
Optional<Float> min = executionTimes.stream().min(Float::compareTo);
207+
return min.orElse(null);
208+
}
209+
210+
public Float getMaxTime(String kernelName)
211+
{
212+
if (executionTimes.isEmpty())
213+
{
214+
LogTools.info("No recorded times for " + kernelName);
215+
return Float.NaN;
216+
}
217+
218+
Optional<Float> max = executionTimes.stream().max(Float::compareTo);
219+
return max.orElse(null);
220+
}
221+
222+
public double getStandardDeviation(String kernelName)
223+
{
224+
if (executionTimes.isEmpty())
225+
{
226+
LogTools.info("No recorded times for " + kernelName);
227+
return Float.NaN;
228+
}
229+
230+
double average = executionTimes.stream().mapToDouble(Float::doubleValue).average().orElse(0.0);
231+
double variance = executionTimes.stream().mapToDouble(time -> Math.pow(time - average, 2)).average().orElse(0.0);
232+
return Math.sqrt(variance);
233+
}
234+
235+
public void printTimesForKernel()
236+
{
237+
if (executionTimes.isEmpty())
238+
{
239+
LogTools.info("No recorded times for " + CUDAKernel.this.name);
240+
}
241+
242+
double average = getAverageTime(CUDAKernel.this.name);
243+
double variance = getStandardDeviation(CUDAKernel.this.name);
244+
double min = getMinTime(CUDAKernel.this.name);
245+
double max = getMaxTime(CUDAKernel.this.name);
246+
247+
LogTools.info("Timings for kernel " + CUDAKernel.this.name + " in milliseconds!");
248+
LogTools.info("| Average time: " + average);
249+
LogTools.info("| Variance time: " + variance);
250+
LogTools.info("| Min time: " + min);
251+
LogTools.info("| Max time: " + max);
252+
LogTools.warn("******************************************");
253+
}
254+
}
126255
}

ihmc-perception/src/main/java/us/ihmc/perception/gpuHeightMap/RapidHeightMapExtractorCUDA.java

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,11 @@
2727

2828
import java.net.URL;
2929

30-
import static org.bytedeco.cuda.global.cudart.cudaFree;
31-
import static org.bytedeco.cuda.global.cudart.cudaStreamSynchronize;
30+
import static org.bytedeco.cuda.global.cudart.*;
3231

3332
public class RapidHeightMapExtractorCUDA implements RapidHeightMapExtractorInterface
3433
{
34+
private static final boolean PRINT_TIMING_FOR_KERNELS = false;
3535
static final int BLOCK_SIZE_XY = 32;
3636
static final HeightMapParameters heightMapParameters = new HeightMapParameters("GPU");
3737

@@ -122,6 +122,12 @@ public RapidHeightMapExtractorCUDA(ReferenceFrame leftFootSoleFrame,
122122
planOffsetKernel = heightMapCUDAProgram.loadKernel("planOffsetKernel");
123123
emptyRegisterKernel = heightMapCUDAProgram.loadKernel("heightMapRegistrationKernel");
124124

125+
updateKernel.enableKernelTimings(PRINT_TIMING_FOR_KERNELS);
126+
registerKernel.enableKernelTimings(PRINT_TIMING_FOR_KERNELS);
127+
croppingKernel.enableKernelTimings(PRINT_TIMING_FOR_KERNELS);
128+
planOffsetKernel.enableKernelTimings(PRINT_TIMING_FOR_KERNELS);
129+
emptyRegisterKernel.enableKernelTimings(PRINT_TIMING_FOR_KERNELS);
130+
125131
// Initialize matrices and images
126132
localHeightMapImage = new GpuMat(localCellsPerAxis, localCellsPerAxis, opencv_core.CV_16UC1);
127133
globalHeightMapImage = new GpuMat(globalCellsPerAxis, globalCellsPerAxis, opencv_core.CV_16UC1);

ihmc-perception/src/main/java/us/ihmc/perception/gpuHeightMap/SnappingHeightMapExtractor.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525

2626
public class SnappingHeightMapExtractor
2727
{
28+
private static final boolean PRINT_TIMING_FOR_KERNELS = false;
29+
2830
private final SteppableRegionCalculatorParameters steppableRegionParameters = new SteppableRegionCalculatorParameters();
2931

3032
private final TerrainMapData terrainMapData;
@@ -62,6 +64,8 @@ public SnappingHeightMapExtractor(TerrainMapData terrainMapData)
6264
snappingHeightMapProgram = new CUDAProgram(kernelPath, heightMapUtilsHeaderPath, mathUtilsHeaderPath);
6365
snappingKernel = snappingHeightMapProgram.loadKernel("computeSnappedValuesKernel");
6466

67+
snappingKernel.enableKernelTimings(PRINT_TIMING_FOR_KERNELS);
68+
6569
snappingParametersHostPointer = new FloatPointer(17);
6670
snappingParametersDevicePointer = new FloatPointer();
6771
}

0 commit comments

Comments
 (0)