@@ -229,7 +229,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
229
229
for (int i = 0 ; i < BLOCK_SIZE /GEMM_BLOCK ; i ++ ) {
230
230
for (int ii = 0 ; ii < GEMM_BLOCK ; ii ++ ) {
231
231
for (int j = 0 ; j < BLOCK_SIZE /GEMM_BLOCK ; j ++ ) {
232
- __attribute__((opencl_unroll_hint (GLOBAL_MEM_UNROLL )))
232
+ __attribute__((opencl_unroll_hint (GEMM_BLOCK )))
233
233
for (int jj = 0 ; jj < GEMM_BLOCK ; jj ++ ) {
234
234
a_buffer [i ][j ][ii ][jj ] = a [block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii ) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj ];
235
235
}
@@ -411,7 +411,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
411
411
for (int i = 0 ; i < BLOCK_SIZE /GEMM_BLOCK ; i ++ ) {
412
412
for (int ii = 0 ; ii < GEMM_BLOCK ; ii ++ ) {
413
413
for (int j = 0 ; j < BLOCK_SIZE /GEMM_BLOCK ; j ++ ) {
414
- __attribute__((opencl_unroll_hint (GLOBAL_MEM_UNROLL )))
414
+ __attribute__((opencl_unroll_hint (GEMM_BLOCK )))
415
415
for (int jj = 0 ; jj < GEMM_BLOCK ; jj ++ ) {
416
416
a [block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii ) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj ] = a_buffer [i ][j ][ii ][jj ];
417
417
}
@@ -423,7 +423,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
423
423
for (int i = 0 ; i < BLOCK_SIZE /GEMM_BLOCK ; i ++ ) {
424
424
for (int ii = 0 ; ii < GEMM_BLOCK ; ii ++ ) {
425
425
for (int j = 0 ; j < BLOCK_SIZE /GEMM_BLOCK ; j ++ ) {
426
- __attribute__((opencl_unroll_hint (GLOBAL_MEM_UNROLL )))
426
+ __attribute__((opencl_unroll_hint (GEMM_BLOCK )))
427
427
for (int jj = 0 ; jj < GEMM_BLOCK ; jj ++ ) {
428
428
a_block_trans [(i * GEMM_BLOCK + ii ) * BLOCK_SIZE + j * GEMM_BLOCK + jj ] = a_buffer [j ][i ][jj ][ii ];
429
429
}
@@ -434,7 +434,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
434
434
for (int i = 0 ; i < BLOCK_SIZE /GEMM_BLOCK ; i ++ ) {
435
435
for (int ii = 0 ; ii < GEMM_BLOCK ; ii ++ ) {
436
436
for (int j = 0 ; j < BLOCK_SIZE /GEMM_BLOCK ; j ++ ) {
437
- __attribute__((opencl_unroll_hint (GLOBAL_MEM_UNROLL )))
437
+ __attribute__((opencl_unroll_hint (GEMM_BLOCK )))
438
438
for (int jj = 0 ; jj < GEMM_BLOCK ; jj ++ ) {
439
439
a_block [(i * GEMM_BLOCK + ii ) * BLOCK_SIZE + j * GEMM_BLOCK + jj ] = a_buffer [i ][j ][ii ][jj ];
440
440
}
@@ -466,7 +466,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a,
466
466
for (int i = 0 ; i < BLOCK_SIZE /GEMM_BLOCK ; i ++ ) {
467
467
for (int ii = 0 ; ii < GEMM_BLOCK ; ii ++ ) {
468
468
for (int j = 0 ; j < BLOCK_SIZE /GEMM_BLOCK ; j ++ ) {
469
- __attribute__((opencl_unroll_hint (GLOBAL_MEM_UNROLL )))
469
+ __attribute__((opencl_unroll_hint (GEMM_BLOCK )))
470
470
for (int jj = 0 ; jj < GEMM_BLOCK ; jj ++ ) {
471
471
a_buffer [i ][j ][ii ][jj ] = a [block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii ) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj ];
472
472
}
@@ -558,7 +558,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a,
558
558
for (int i = 0 ; i < BLOCK_SIZE /GEMM_BLOCK ; i ++ ) {
559
559
for (int ii = 0 ; ii < GEMM_BLOCK ; ii ++ ) {
560
560
for (int j = 0 ; j < BLOCK_SIZE /GEMM_BLOCK ; j ++ ) {
561
- __attribute__((opencl_unroll_hint (GLOBAL_MEM_UNROLL )))
561
+ __attribute__((opencl_unroll_hint (GEMM_BLOCK )))
562
562
for (int jj = 0 ; jj < GEMM_BLOCK ; jj ++ ) {
563
563
a [block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii ) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj ] = a_buffer [i ][j ][ii ][jj ];
564
564
}
@@ -570,7 +570,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a,
570
570
for (int i = 0 ; i < BLOCK_SIZE /GEMM_BLOCK ; i ++ ) {
571
571
for (int ii = 0 ; ii < GEMM_BLOCK ; ii ++ ) {
572
572
for (int j = 0 ; j < BLOCK_SIZE /GEMM_BLOCK ; j ++ ) {
573
- __attribute__((opencl_unroll_hint (GLOBAL_MEM_UNROLL )))
573
+ __attribute__((opencl_unroll_hint (GEMM_BLOCK )))
574
574
for (int jj = 0 ; jj < GEMM_BLOCK ; jj ++ ) {
575
575
top_block [(i * GEMM_BLOCK + ii ) * BLOCK_SIZE + j * GEMM_BLOCK + jj ] = a_buffer [i ][j ][ii ][jj ];
576
576
}
@@ -601,7 +601,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a,
601
601
for (int i = 0 ; i < BLOCK_SIZE /GEMM_BLOCK ; i ++ ) {
602
602
for (int ii = 0 ; ii < GEMM_BLOCK ; ii ++ ) {
603
603
for (int j = 0 ; j < BLOCK_SIZE /GEMM_BLOCK ; j ++ ) {
604
- __attribute__((opencl_unroll_hint (GLOBAL_MEM_UNROLL )))
604
+ __attribute__((opencl_unroll_hint (GEMM_BLOCK )))
605
605
for (int jj = 0 ; jj < GEMM_BLOCK ; jj ++ ) {
606
606
a_buffer [i ][j ][ii ][jj ] = a [block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii ) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj ];
607
607
}
@@ -684,7 +684,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a,
684
684
for (int i = 0 ; i < BLOCK_SIZE /GEMM_BLOCK ; i ++ ) {
685
685
for (int ii = 0 ; ii < GEMM_BLOCK ; ii ++ ) {
686
686
for (int j = 0 ; j < BLOCK_SIZE /GEMM_BLOCK ; j ++ ) {
687
- __attribute__((opencl_unroll_hint (GLOBAL_MEM_UNROLL )))
687
+ __attribute__((opencl_unroll_hint (GEMM_BLOCK )))
688
688
for (int jj = 0 ; jj < GEMM_BLOCK ; jj ++ ) {
689
689
a [block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii ) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj ] = a_buffer [i ][j ][ii ][jj ];
690
690
}
@@ -697,7 +697,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a,
697
697
for (int i = 0 ; i < BLOCK_SIZE /GEMM_BLOCK ; i ++ ) {
698
698
for (int ii = 0 ; ii < GEMM_BLOCK ; ii ++ ) {
699
699
for (int j = 0 ; j < BLOCK_SIZE /GEMM_BLOCK ; j ++ ) {
700
- __attribute__((opencl_unroll_hint (GLOBAL_MEM_UNROLL )))
700
+ __attribute__((opencl_unroll_hint (GEMM_BLOCK )))
701
701
for (int jj = 0 ; jj < GEMM_BLOCK ; jj ++ ) {
702
702
left_block [(i * GEMM_BLOCK + ii ) * BLOCK_SIZE + j * GEMM_BLOCK + jj ] = a_buffer [j ][i ][jj ][ii ];
703
703
}
0 commit comments