Skip to content

Commit 48e0386

Browse files
committed
Merge branch 'hpl-dp' into 'master'
Fix HPL to support DP FP See merge request pc2/HPCC_FPGA!52
2 parents ff640b0 + 62097c4 commit 48e0386

14 files changed

+127
-49
lines changed

.gitlab-ci.yml

+42
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,28 @@ build:LINPACK:
152152
- cmake/**/*
153153
- .gitlab-ci.yml
154154

155+
156+
build:LINPACK_DP:
157+
stage: build
158+
script:
159+
- rm -rf build
160+
- mkdir -p build
161+
- cd build
162+
- cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DDATA_TYPE=double
163+
- make -j 40 all
164+
artifacts:
165+
paths:
166+
- build/bin/hpl_torus_PCIE_emulate.aocx
167+
- build/bin/hpl_torus_IEC_emulate.aocx
168+
- build/bin/Linpack_intel
169+
- build/bin/Linpack_test_intel
170+
only:
171+
changes:
172+
- LINPACK/**/*
173+
- shared/**/*
174+
- scripts/**/*
175+
- cmake/**/*
176+
155177
build:GEMM:
156178
stage: build
157179
script:
@@ -420,6 +442,26 @@ test:LINPACK:
420442
- .gitlab-ci.yml
421443
needs: ["build:LINPACK"]
422444

445+
test:LINPACK_DP:
446+
stage: test
447+
script:
448+
- cd build
449+
- cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DDATA_TYPE=double
450+
- make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test
451+
dependencies:
452+
- build:LINPACK_DP
453+
artifacts:
454+
when: on_failure
455+
paths:
456+
- build/Testing/Temporary/LastTest.log
457+
only:
458+
changes:
459+
- LINPACK/**/*
460+
- shared/**/*
461+
- scripts/**/*
462+
- cmake/**/*
463+
needs: ["build:LINPACK_DP"]
464+
423465
test:GEMM:
424466
stage: test
425467
script:

LINPACK/CHANGELOG

+5
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@
22

33
This file contains all changes made to the source code for each release.
44

5+
6+
## 2.4
7+
#### Added:
8+
- Support for double-precision floating-point
9+
510
## 2.3
611
#### Changed:
712
- Refactored the code to support different execution kernels and data distributions

LINPACK/CMakeLists.txt

+1-9
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
cmake_minimum_required(VERSION 3.1)
2-
project(LINPACK VERSION 2.3)
2+
project(LINPACK VERSION 2.4)
33

44
set(USE_DEPRECATED_HPP_HEADER No)
55

@@ -21,14 +21,6 @@ if (TEST_EMULATION)
2121
set(TEST_HOST_FLAGS "--emulation")
2222
endif()
2323

24-
set(DATA_TYPE float)
25-
if (DATA_TYPE STREQUAL "double")
26-
set(_DP Yes)
27-
message(STATUS "Set DP flag since data type seems to be double precision")
28-
else()
29-
set(_DP No)
30-
endif()
31-
3224
set(USE_OPENMP Yes)
3325
set(USE_MPI Yes)
3426

LINPACK/Readme.md

+1
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ Name | Default | Description |
5151
`DEFAULT_MATRIX_SIZE`| 1024 | Width and height of the input matrix |
5252
`REGISTER_BLOCK_LOG`| 3 | Size of the blocks that will be processed in registers (2^3=8 is the default) |
5353
`LOCAL_MEM_BLOCK_LOG`| 5 | Size of the blocks that will be processed in local memory (2^5=32 is the default) |
54+
`DATA_TYPE` | float | Used data type. Can be `float` or `double` |
5455

5556
Moreover the environment variable `INTELFPGAOCLSDKROOT` has to be set to the root
5657
of the Intel FPGA SDK installation.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# This file contains the default configuration for the Nallatech 520N board
2+
# for use with double precision floating point values.
3+
# To use this configuration file, call cmake with the parameter
4+
#
5+
# cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
6+
#
7+
8+
9+
set(USE_MPI Yes CACHE BOOL "" FORCE)
10+
set(USE_SVM No CACHE BOOL "" FORCE)
11+
set(USE_HBM No CACHE BOOL "" FORCE)
12+
set(FPGA_BOARD_NAME "p520_max_sg280l" CACHE STRING "" FORCE)
13+
set(AOC_FLAGS "-fpc -fp-relaxed -seed=7" CACHE STRING "" FORCE)
14+
15+
set(DATA_TYPE "double" CACHE STRING "The ued data type for calculation" FORCE)
16+
17+
# LINPACK specific options
18+
set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE)
19+
set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE)
20+
set(REGISTER_BLOCK_LOG 2 CACHE STRING "Size of the block that will be manipulated in registers" FORCE)
21+
set(NUM_REPLICATIONS 5 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE)
22+

LINPACK/src/common/parameters.h.in

+7-2
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,18 @@
1010
#define DEFAULT_DEVICE @DEFAULT_DEVICE@
1111
#define HOST_DATA_TYPE @HOST_DATA_TYPE@
1212
#define DEFAULT_MATRIX_SIZE @DEFAULT_MATRIX_SIZE@
13-
#cmakedefine _DP @_DP@
13+
#cmakedefine _DP
14+
15+
#ifdef _DP
16+
#define MPI_DATA_TYPE MPI_DOUBLE
17+
#else
18+
#define MPI_DATA_TYPE MPI_FLOAT
19+
#endif
1420

1521
/**
1622
* Device specific parameters
1723
*/
1824
#define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@
19-
#define GLOBAL_MEM_UNROLL @GLOBAL_MEM_UNROLL@
2025
#define LOCAL_MEM_BLOCK_LOG @LOCAL_MEM_BLOCK_LOG@
2126
#define REGISTER_BLOCK_LOG @REGISTER_BLOCK_LOG@
2227
#define NUM_REPLICATIONS @NUM_REPLICATIONS@

LINPACK/src/device/hpl_torus_IEC.cl

+6-6
Original file line numberDiff line numberDiff line change
@@ -404,7 +404,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
404404
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
405405
for (int ii =0; ii < GEMM_BLOCK; ii++) {
406406
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
407-
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
407+
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
408408
for (int jj =0; jj < GEMM_BLOCK; jj++) {
409409
a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj];
410410
}
@@ -569,7 +569,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
569569
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
570570
for (int ii =0; ii < GEMM_BLOCK; ii++) {
571571
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
572-
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
572+
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
573573
for (int jj =0; jj < GEMM_BLOCK; jj++) {
574574
a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
575575
}
@@ -600,7 +600,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a,
600600
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
601601
for (int ii =0; ii < GEMM_BLOCK; ii++) {
602602
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
603-
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
603+
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
604604
for (int jj =0; jj < GEMM_BLOCK; jj++) {
605605
a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj];
606606
}
@@ -709,7 +709,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a,
709709
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
710710
for (int ii =0; ii < GEMM_BLOCK; ii++) {
711711
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
712-
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
712+
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
713713
for (int jj =0; jj < GEMM_BLOCK; jj++) {
714714
a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
715715
}
@@ -739,7 +739,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a,
739739
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
740740
for (int ii =0; ii < GEMM_BLOCK; ii++) {
741741
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
742-
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
742+
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
743743
for (int jj =0; jj < GEMM_BLOCK; jj++) {
744744
a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj];
745745
}
@@ -829,7 +829,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a,
829829
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
830830
for (int ii =0; ii < GEMM_BLOCK; ii++) {
831831
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
832-
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
832+
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
833833
for (int jj =0; jj < GEMM_BLOCK; jj++) {
834834
a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
835835
}

LINPACK/src/device/hpl_torus_PCIE.cl

+10-10
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
229229
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
230230
for (int ii =0; ii < GEMM_BLOCK; ii++) {
231231
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
232-
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
232+
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
233233
for (int jj =0; jj < GEMM_BLOCK; jj++) {
234234
a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj];
235235
}
@@ -411,7 +411,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
411411
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
412412
for (int ii =0; ii < GEMM_BLOCK; ii++) {
413413
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
414-
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
414+
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
415415
for (int jj =0; jj < GEMM_BLOCK; jj++) {
416416
a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
417417
}
@@ -423,7 +423,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
423423
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
424424
for (int ii =0; ii < GEMM_BLOCK; ii++) {
425425
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
426-
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
426+
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
427427
for (int jj =0; jj < GEMM_BLOCK; jj++) {
428428
a_block_trans[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[j][i][jj][ii];
429429
}
@@ -434,7 +434,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
434434
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
435435
for (int ii =0; ii < GEMM_BLOCK; ii++) {
436436
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
437-
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
437+
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
438438
for (int jj =0; jj < GEMM_BLOCK; jj++) {
439439
a_block[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
440440
}
@@ -466,7 +466,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a,
466466
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
467467
for (int ii =0; ii < GEMM_BLOCK; ii++) {
468468
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
469-
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
469+
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
470470
for (int jj =0; jj < GEMM_BLOCK; jj++) {
471471
a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj];
472472
}
@@ -558,7 +558,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a,
558558
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
559559
for (int ii =0; ii < GEMM_BLOCK; ii++) {
560560
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
561-
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
561+
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
562562
for (int jj =0; jj < GEMM_BLOCK; jj++) {
563563
a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
564564
}
@@ -570,7 +570,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a,
570570
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
571571
for (int ii =0; ii < GEMM_BLOCK; ii++) {
572572
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
573-
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
573+
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
574574
for (int jj =0; jj < GEMM_BLOCK; jj++) {
575575
top_block[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
576576
}
@@ -601,7 +601,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a,
601601
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
602602
for (int ii =0; ii < GEMM_BLOCK; ii++) {
603603
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
604-
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
604+
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
605605
for (int jj =0; jj < GEMM_BLOCK; jj++) {
606606
a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj];
607607
}
@@ -684,7 +684,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a,
684684
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
685685
for (int ii =0; ii < GEMM_BLOCK; ii++) {
686686
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
687-
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
687+
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
688688
for (int jj =0; jj < GEMM_BLOCK; jj++) {
689689
a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
690690
}
@@ -697,7 +697,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a,
697697
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
698698
for (int ii =0; ii < GEMM_BLOCK; ii++) {
699699
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
700-
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
700+
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
701701
for (int jj =0; jj < GEMM_BLOCK; jj++) {
702702
left_block[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[j][i][jj][ii];
703703
}

LINPACK/src/host/execution_types/execution_pcie.hpp

+4-4
Original file line numberDiff line numberDiff line change
@@ -225,9 +225,9 @@ calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings>&co
225225
lu_queues.back().finish();
226226

227227
// Broadcast LU block in column to update all left blocks
228-
MPI_Bcast(lu_block, config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, col_communicator);
228+
MPI_Bcast(lu_block, config.programSettings->blockSize*config.programSettings->blockSize, MPI_DATA_TYPE, local_block_row_remainder, col_communicator);
229229
// Broadcast LU block in row to update all top blocks
230-
MPI_Bcast(lu_trans_block, config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, row_communicator);
230+
MPI_Bcast(lu_trans_block, config.programSettings->blockSize*config.programSettings->blockSize, MPI_DATA_TYPE, local_block_row_remainder, row_communicator);
231231

232232
if (num_top_blocks > 0) {
233233

@@ -329,10 +329,10 @@ calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings>&co
329329

330330
// Send the left and top blocks to all other ranks so they can be used to update all inner blocks
331331
for (int lbi=0; lbi < blocks_per_row - local_block_row; lbi++) {
332-
MPI_Bcast(left_blocks[lbi], config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, row_communicator);
332+
MPI_Bcast(left_blocks[lbi], config.programSettings->blockSize*config.programSettings->blockSize, MPI_DATA_TYPE, local_block_row_remainder, row_communicator);
333333
}
334334
for (int tbi=0; tbi < blocks_per_row - local_block_row; tbi++) {
335-
MPI_Bcast(top_blocks[tbi], config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, col_communicator);
335+
MPI_Bcast(top_blocks[tbi], config.programSettings->blockSize*config.programSettings->blockSize, MPI_DATA_TYPE, local_block_row_remainder, col_communicator);
336336
}
337337

338338
// update all remaining inner blocks using only global memory

0 commit comments

Comments
 (0)