Skip to content

Commit b3f5156

Browse files
author
Marius Meyer
committed
Merge branch 'intel-pacsvm' into 'master'
Add Intel SVM support to all benchmarks. See merge request pc2/HPCC_FPGA!14
2 parents ea9d696 + 48d5eed commit b3f5156

33 files changed

+607
-140
lines changed

FFT/src/common/parameters.h.in

+1
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
#define LOG_FFT_SIZE @LOG_FFT_SIZE@
2020
#define FFT_UNROLL @FFT_UNROLL@
2121

22+
#cmakedefine USE_SVM
2223
/*
2324
Short description of the program.
2425
Moreover the version and build time is also compiled into the description.

FFT/src/host/CMakeLists.txt

+3
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,9 @@ if (INTELFPGAOPENCL_FOUND)
1313
target_link_libraries(${LIB_NAME}_intel "${IntelFPGAOpenCL_LIBRARIES}" "${OpenMP_CXX_FLAGS}")
1414
target_link_libraries(${LIB_NAME}_intel hpcc_fpga_base)
1515
target_link_libraries(${HOST_EXE_NAME}_intel ${LIB_NAME}_intel)
16+
if (USE_SVM)
17+
target_compile_definitions(${LIB_NAME}_intel PRIVATE -DCL_VERSION_2_0)
18+
endif()
1619
target_compile_definitions(${LIB_NAME}_intel PRIVATE -DINTEL_FPGA)
1720
target_compile_options(${LIB_NAME}_intel PRIVATE "${OpenMP_CXX_FLAGS}")
1821
add_test(NAME test_intel_host_executable COMMAND $<TARGET_FILE:${HOST_EXE_NAME}_intel> -h)

FFT/src/host/execution.h

+1-1
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ simple exchange of the different calculation methods.
4747
@return The resulting matrix
4848
*/
4949
std::unique_ptr<fft::FFTExecutionTimings>
50-
calculate(hpcc_base::ExecutionSettings<fft::FFTProgramSettings> const& config, std::complex<HOST_DATA_TYPE>* data, unsigned iterations, bool inverse);
50+
calculate(hpcc_base::ExecutionSettings<fft::FFTProgramSettings> const& config, std::complex<HOST_DATA_TYPE>* data, std::complex<HOST_DATA_TYPE>* data_out, unsigned iterations, bool inverse);
5151

5252
} // namespace bm_execution
5353

FFT/src/host/execution_default.cpp

+32-5
Original file line number | Diff line number | Diff line change
@@ -42,26 +42,45 @@ namespace bm_execution {
4242
std::unique_ptr<fft::FFTExecutionTimings>
4343
calculate(hpcc_base::ExecutionSettings<fft::FFTProgramSettings> const& config,
4444
std::complex<HOST_DATA_TYPE>* data,
45+
std::complex<HOST_DATA_TYPE>* data_out,
4546
unsigned iterations,
4647
bool inverse) {
4748

4849
cl::Buffer inBuffer = cl::Buffer(*config.context, CL_MEM_WRITE_ONLY, (1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE));
4950
cl::Buffer outBuffer = cl::Buffer(*config.context, CL_MEM_READ_ONLY, (1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE));
5051

5152
cl::Kernel fetchKernel(*config.program, FETCH_KERNEL_NAME);
52-
53-
fetchKernel.setArg(0, inBuffer);
54-
5553
cl::Kernel fftKernel(*config.program, FFT_KERNEL_NAME);
5654

55+
#ifdef USE_SVM
56+
clSetKernelArgSVMPointer(fetchKernel(), 0,
57+
reinterpret_cast<void*>(data));
58+
clSetKernelArgSVMPointer(fftKernel(), 0,
59+
reinterpret_cast<void*>(data_out));
60+
#else
61+
fetchKernel.setArg(0, inBuffer);
5762
fftKernel.setArg(0, outBuffer);
63+
#endif
5864
fftKernel.setArg(1, iterations);
5965
fftKernel.setArg(2, static_cast<cl_int>(inverse));
6066

6167
cl::CommandQueue fetchQueue(*config.context);
6268
cl::CommandQueue fftQueue(*config.context);
6369

70+
#ifdef USE_SVM
71+
clEnqueueSVMMap(fetchQueue(), CL_TRUE,
72+
CL_MAP_READ,
73+
reinterpret_cast<void *>(data),
74+
(1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE), 0,
75+
NULL, NULL);
76+
clEnqueueSVMMap(fftQueue(), CL_TRUE,
77+
CL_MAP_WRITE,
78+
reinterpret_cast<void *>(data_out),
79+
(1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE), 0,
80+
NULL, NULL);
81+
#else
6482
fetchQueue.enqueueWriteBuffer(inBuffer,CL_TRUE,0, (1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE), data);
83+
#endif
6584

6685
std::vector<double> calculationTimings;
6786
for (uint r =0; r < config.programSettings->numRepetitions; r++) {
@@ -77,8 +96,16 @@ namespace bm_execution {
7796
(endCalculation - startCalculation);
7897
calculationTimings.push_back(calculationTime.count());
7998
}
80-
81-
fetchQueue.enqueueReadBuffer(outBuffer,CL_TRUE,0, (1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE), data);
99+
#ifdef USE_SVM
100+
clEnqueueSVMUnmap(fetchQueue(),
101+
reinterpret_cast<void *>(data), 0,
102+
NULL, NULL);
103+
clEnqueueSVMUnmap(fftQueue(),
104+
reinterpret_cast<void *>(data_out), 0,
105+
NULL, NULL);
106+
#else
107+
fetchQueue.enqueueReadBuffer(outBuffer,CL_TRUE,0, (1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE), data_out);
108+
#endif
82109

83110
std::unique_ptr<fft::FFTExecutionTimings> result(new fft::FFTExecutionTimings{
84111
calculationTimings

FFT/src/host/fft_benchmark.cpp

+32-7
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,30 @@ fft::FFTProgramSettings::getSettingsMap() {
4747
return map;
4848
}
4949

50+
fft::FFTData::FFTData(cl::Context context, uint iterations) : context(context) {
51+
#ifdef USE_SVM
52+
data = reinterpret_cast<std::complex<HOST_DATA_TYPE>*>(
53+
clSVMAlloc(context(), 0 ,
54+
iterations * (1 << LOG_FFT_SIZE) * sizeof(std::complex<HOST_DATA_TYPE>), 1024));
55+
data_out = reinterpret_cast<std::complex<HOST_DATA_TYPE>*>(
56+
clSVMAlloc(context(), 0 ,
57+
iterations * (1 << LOG_FFT_SIZE) * sizeof(std::complex<HOST_DATA_TYPE>), 1024));
58+
#else
59+
posix_memalign(reinterpret_cast<void**>(&data), 64, iterations * (1 << LOG_FFT_SIZE) * sizeof(std::complex<HOST_DATA_TYPE>));
60+
posix_memalign(reinterpret_cast<void**>(&data_out), 64, iterations * (1 << LOG_FFT_SIZE) * sizeof(std::complex<HOST_DATA_TYPE>));
61+
#endif
62+
}
63+
64+
fft::FFTData::~FFTData() {
65+
#ifdef USE_SVM
66+
clSVMFree(context(), reinterpret_cast<void*>(data));
67+
clSVMFree(context(), reinterpret_cast<void*>(data_out));
68+
#else
69+
free(data);
70+
free(data_out);
71+
#endif
72+
}
73+
5074
fft::FFTBenchmark::FFTBenchmark(int argc, char* argv[]) {
5175
setupBenchmark(argc, argv);
5276
}
@@ -63,7 +87,7 @@ fft::FFTBenchmark::addAdditionalParseOptions(cxxopts::Options &options) {
6387

6488
std::unique_ptr<fft::FFTExecutionTimings>
6589
fft::FFTBenchmark::executeKernel(FFTData &data) {
66-
return bm_execution::calculate(*executionSettings, data.data,executionSettings->programSettings->iterations,
90+
return bm_execution::calculate(*executionSettings, data.data, data.data_out, executionSettings->programSettings->iterations,
6791
executionSettings->programSettings->inverse);
6892
}
6993

@@ -85,33 +109,34 @@ fft::FFTBenchmark::printResults(const fft::FFTExecutionTimings &output) {
85109

86110
std::unique_ptr<fft::FFTData>
87111
fft::FFTBenchmark::generateInputData() {
88-
auto d = std::unique_ptr<fft::FFTData>(new fft::FFTData(executionSettings->programSettings->iterations));
112+
auto d = std::unique_ptr<fft::FFTData>(new fft::FFTData(*executionSettings->context, executionSettings->programSettings->iterations));
89113
std::mt19937 gen(0);
90114
auto dis = std::uniform_real_distribution<HOST_DATA_TYPE>(-1.0, 1.0);
91115
for (int i=0; i< executionSettings->programSettings->iterations * (1 << LOG_FFT_SIZE); i++) {
92116
d->data[i].real(dis(gen));
93117
d->data[i].imag(dis(gen));
118+
d->data_out[i].real(0.0);
119+
d->data_out[i].imag(0.0);
94120
}
95121
return d;
96122
}
97123

98124
bool
99125
fft::FFTBenchmark::validateOutputAndPrintError(fft::FFTData &data) {
100-
auto verify_data = generateInputData();
101126
double residual_max = 0;
102127
for (int i = 0; i < executionSettings->programSettings->iterations; i++) {
103128
// we have to bit reverse the output data of the FPGA kernel, since it will be provided in bit-reversed order.
104129
// Directly applying iFFT on the data would thus not form the identity function we want to have for verification.
105130
// TODO: This might need to be changed for other FPGA implementations that return the data in correct order
106-
fft::bit_reverse(&data.data[i * (1 << LOG_FFT_SIZE)], 1);
107-
fft::fourier_transform_gold(true, LOG_FFT_SIZE, &data.data[i * (1 << LOG_FFT_SIZE)]);
131+
fft::bit_reverse(&data.data_out[i * (1 << LOG_FFT_SIZE)], 1);
132+
fft::fourier_transform_gold(true, LOG_FFT_SIZE, &data.data_out[i * (1 << LOG_FFT_SIZE)]);
108133

109134
// Normalize the data after applying iFFT
110135
for (int j = 0; j < (1 << LOG_FFT_SIZE); j++) {
111-
data.data[i * (1 << LOG_FFT_SIZE) + j] /= (1 << LOG_FFT_SIZE);
136+
data.data_out[i * (1 << LOG_FFT_SIZE) + j] /= (1 << LOG_FFT_SIZE);
112137
}
113138
for (int j = 0; j < (1 << LOG_FFT_SIZE); j++) {
114-
double tmp_error = std::abs(verify_data->data[i * (1 << LOG_FFT_SIZE) + j] - data.data[i * (1 << LOG_FFT_SIZE) + j]);
139+
double tmp_error = std::abs(data.data[i * (1 << LOG_FFT_SIZE) + j] - data.data_out[i * (1 << LOG_FFT_SIZE) + j]);
115140
residual_max = residual_max > tmp_error ? residual_max : tmp_error;
116141
}
117142
}

FFT/src/host/fft_benchmark.hpp

+16-7
Original file line number | Diff line number | Diff line change
@@ -80,27 +80,36 @@ class FFTData {
8080
public:
8181

8282
/**
83-
* @brief The data array used ofr the FFT calculation
83+
* @brief The data array used as input of the FFT calculation
8484
*
8585
*/
8686
std::complex<HOST_DATA_TYPE>* data;
8787

88+
/**
89+
* @brief The data array used as output of the FFT calculation
90+
*
91+
*/
92+
std::complex<HOST_DATA_TYPE>* data_out;
93+
94+
/**
95+
* @brief The context that is used to allocate memory in SVM mode
96+
*
97+
*/
98+
cl::Context context;
99+
88100
/**
89101
* @brief Construct a new FFT Data object
90102
*
103+
* @param context The OpenCL context used to allocate memory in SVM mode
91104
* @param iterations Number of FFT data that will be stored sequentially in the array
92105
*/
93-
FFTData(uint iterations) {
94-
posix_memalign(reinterpret_cast<void**>(&data), 64, iterations * (1 << LOG_FFT_SIZE) * sizeof(std::complex<HOST_DATA_TYPE>));
95-
}
106+
FFTData(cl::Context context, uint iterations);
96107

97108
/**
98109
* @brief Destroy the FFT Data object. Free the allocated memory
99110
*
100111
*/
101-
~FFTData() {
102-
free(data);
103-
}
112+
~FFTData();
104113

105114
};
106115

FFT/tests/test_execution_functionality.cpp

+23-17
Original file line number | Diff line number | Diff line change
@@ -58,7 +58,7 @@ TEST_F(FFTKernelTest, FFTReturnsZero) {
5858
}
5959
auto result = bm->executeKernel(*data);
6060
for (int i=0; i<(1 << LOG_FFT_SIZE); i++) {
61-
EXPECT_FLOAT_EQ(std::abs(data->data[i]), 0.0);
61+
EXPECT_FLOAT_EQ(std::abs(data->data_out[i]), 0.0);
6262
}
6363
}
6464

@@ -72,11 +72,11 @@ TEST_F(FFTKernelTest, FFTCloseToZeroForAll1And1) {
7272
data->data[i].imag(1.0);
7373
}
7474
auto result = bm->executeKernel(*data);
75-
EXPECT_NEAR(data->data[0].real(), (1 << LOG_FFT_SIZE), 0.00001);
76-
EXPECT_NEAR(data->data[0].imag(), (1 << LOG_FFT_SIZE), 0.00001);
75+
EXPECT_NEAR(data->data_out[0].real(), (1 << LOG_FFT_SIZE), 0.00001);
76+
EXPECT_NEAR(data->data_out[0].imag(), (1 << LOG_FFT_SIZE), 0.00001);
7777
for (int i=1; i < (1 << LOG_FFT_SIZE); i++) {
78-
EXPECT_NEAR(data->data[i].real(), 0.0, 0.00001);
79-
EXPECT_NEAR(data->data[i].imag(), 0.0, 0.00001);
78+
EXPECT_NEAR(data->data_out[i].real(), 0.0, 0.00001);
79+
EXPECT_NEAR(data->data_out[i].imag(), 0.0, 0.00001);
8080
}
8181
}
8282

@@ -90,11 +90,11 @@ TEST_F(FFTKernelTest, IFFTCloseToZeroForAll1And1) {
9090
data->data[i].imag(0.0);
9191
}
9292
auto result = bm->executeKernel(*data);
93-
EXPECT_NEAR(data->data[0].real(), static_cast<HOST_DATA_TYPE>(1 << LOG_FFT_SIZE), 0.00001);
94-
EXPECT_NEAR(data->data[0].imag(), 0.0, 0.00001);
93+
EXPECT_NEAR(data->data_out[0].real(), static_cast<HOST_DATA_TYPE>(1 << LOG_FFT_SIZE), 0.00001);
94+
EXPECT_NEAR(data->data_out[0].imag(), 0.0, 0.00001);
9595
for (int i=1; i < (1 << LOG_FFT_SIZE); i++) {
96-
EXPECT_NEAR(data->data[i].real(), 0.0, 0.00001);
97-
EXPECT_NEAR(data->data[i].imag(), 0.0, 0.00001);
96+
EXPECT_NEAR(data->data_out[i].real(), 0.0, 0.00001);
97+
EXPECT_NEAR(data->data_out[i].imag(), 0.0, 0.00001);
9898
}
9999
}
100100

@@ -108,18 +108,24 @@ TEST_F(FFTKernelTest, FFTandiFFTProduceResultCloseToSource) {
108108

109109
// Normalize iFFT result
110110
for (int i=0; i<(1 << LOG_FFT_SIZE); i++) {
111-
data->data[i] /= (1 << LOG_FFT_SIZE);
111+
data->data_out[i] /= (1 << LOG_FFT_SIZE);
112112
}
113113

114114
// Need to again bit reverse input for iFFT
115-
fft::bit_reverse(data->data, 1);
115+
fft::bit_reverse(data->data_out, 1);
116+
117+
// Copy to input buffer for iFFT
118+
for (int i=0; i<(1 << LOG_FFT_SIZE); i++) {
119+
data->data[i] = data->data_out[i];
120+
}
121+
116122
bm->getExecutionSettings().programSettings->inverse = true;
117123
auto result2 = bm->executeKernel(*data);
118124
// Since data was already sorted by iFFT, the bit reversal of the kernel has to be undone
119-
fft::bit_reverse(data->data, 1);
125+
fft::bit_reverse(data->data_out, 1);
120126

121127
for (int i=1; i < (1 << LOG_FFT_SIZE); i++) {
122-
EXPECT_NEAR(std::abs(data->data[i]), std::abs(verify_data->data[i]), 0.001);
128+
EXPECT_NEAR(std::abs(data->data_out[i]), std::abs(verify_data->data[i]), 0.001);
123129
}
124130
}
125131

@@ -136,10 +142,10 @@ TEST_F(FFTKernelTest, FPGAFFTAndCPUFFTGiveSameResults) {
136142

137143
// Subtract the verification data from the output so that only the difference remains
138144
for (int i=0; i<(1 << LOG_FFT_SIZE); i++) {
139-
data->data[i] -= verify_data->data[i];
145+
data->data_out[i] -= verify_data->data[i];
140146
}
141147
for (int i=1; i < (1 << LOG_FFT_SIZE); i++) {
142-
EXPECT_NEAR(std::abs(data->data[i]), 0.0, 0.001);
148+
EXPECT_NEAR(std::abs(data->data_out[i]), 0.0, 0.001);
143149
}
144150
}
145151

@@ -157,9 +163,9 @@ TEST_F(FFTKernelTest, FPGAiFFTAndCPUiFFTGiveSameResults) {
157163

158164
// Subtract the verification data from the output so that only the difference remains
159165
for (int i=0; i<(1 << LOG_FFT_SIZE); i++) {
160-
data->data[i] -= verify_data->data[i];
166+
data->data_out[i] -= verify_data->data[i];
161167
}
162168
for (int i=1; i < (1 << LOG_FFT_SIZE); i++) {
163-
EXPECT_NEAR(std::abs(data->data[i]), 0.0, 0.001);
169+
EXPECT_NEAR(std::abs(data->data_out[i]), 0.0, 0.001);
164170
}
165171
}

GEMM/src/common/parameters.h.in

+2
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,8 @@
1818
#define HOST_DATA_TYPE @HOST_DATA_TYPE@
1919
#define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@
2020

21+
#cmakedefine USE_SVM
22+
2123
/*
2224
Short description of the program
2325
*/

0 commit comments

Comments
 (0)