From e78285855f254a7cec56ed9219d933226ea50260 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Fri, 11 Dec 2020 16:38:54 -0800 Subject: [PATCH 1/3] Add gemmini exception handler --- include/gemmini.h | 299 +++++++++++++++++++++++++++------------------- 1 file changed, 173 insertions(+), 126 deletions(-) diff --git a/include/gemmini.h b/include/gemmini.h index 1a91fb1b..b0e3c3ad 100644 --- a/include/gemmini.h +++ b/include/gemmini.h @@ -19,6 +19,8 @@ // Accelerator interface #include "rocc-software/src/xcustom.h" +#define unlikely(x) __builtin_expect(x, 0) + #define k_CONFIG 0 #define k_MVIN2 1 #define k_MVIN 2 @@ -37,6 +39,8 @@ #define k_MVIN3 14 +#define k_FENCE 127 + #define CONFIG_EX 0 #define CONFIG_LD 1 #define CONFIG_ST 2 @@ -176,18 +180,61 @@ acc_scale_t_bits acc_scale_t_to_acc_scale_t_bits(acc_scale_t x) { return un.b; } + #define ROCC_INSTRUCTION_RS1_RS2(x, rs1, rs2, funct) \ ROCC_INSTRUCTION_0_R_R(x, rs1, rs2, funct) +// flush +#define gemmini_flush(skip) \ + ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, skip, 0, k_FLUSH) + + +#define LABEL(i, f) LABEL1(i, f, __LINE__) +#define LABEL1(i, f, l) LABEL2(i, f, l) +#define LABEL2(i, f, l) LABEL_##i##_##f##_##l + +// Handle the exception reported by the return code of a gemmini instruction +// rd is the faulting page address; its LSB selects the access to replay: 1 = store, 0 = load +void handle_gemmini_xcpt(uint64_t rd) { + const int is_st = (int)(rd & 1); + volatile char *addr = (volatile char *)(uintptr_t)rd; + if (is_st) { + *addr = *addr; // store access that writes the byte back unchanged (never clobbers data) + } else { + (void)*addr; // load access only; the value read is discarded + } + gemmini_flush(0); +} + +#define ROCC_INSTRUCTION_RD_RS1_RS2(x, rs1, rs2, funct) \ + ROCC_INSTRUCTION_RD_RS1_RS2_labeled(x, rs1, rs2, funct, 0) + +// Issue the instruction, then check the return code it writes to rd. +// A non-zero rd means gemmini reported a pending exception: handle it, then +// re-issue the instruction, looping until rd is zero (the `l` argument is unused). +// To maintain the illusion of precise restartable exceptions, all gemmini +// instructions should be in this form (except flush) +#define 
ROCC_INSTRUCTION_RD_RS1_RS2_labeled(x, rs1, rs2, funct, l) { \ + uint64_t rd = 0; \ + do { \ + ROCC_INSTRUCTION_R_R_R(x, rd, rs1, rs2, funct); \ + if (unlikely(rd != 0)) { \ + handle_gemmini_xcpt(rd); \ + } \ + } while (unlikely(rd != 0)); \ + } + + + // mvin and mvout #define gemmini_extended_mvin(dram_addr, spad_addr, cols, rows) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, dram_addr, ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (spad_addr), k_MVIN) + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, dram_addr, ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (spad_addr), k_MVIN) #define gemmini_extended_mvin2(dram_addr, spad_addr, cols, rows) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, dram_addr, ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (spad_addr), k_MVIN2) + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, dram_addr, ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (spad_addr), k_MVIN2) #define gemmini_extended_mvin3(dram_addr, spad_addr, cols, rows) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, dram_addr, ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (spad_addr), k_MVIN3) + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, dram_addr, ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (spad_addr), k_MVIN3) #define gemmini_block_mvin(dram_addr, spad_addr, len) \ gemmini_extended_mvin(dram_addr, spad_addr, (len) * DIM, DIM) @@ -196,17 +243,17 @@ acc_scale_t_bits acc_scale_t_to_acc_scale_t_bits(acc_scale_t x) { gemmini_extended_mvin(dram_addr, spad_addr, DIM, DIM) #define gemmini_extended_mvout(dram_addr, spad_addr, cols, rows) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, dram_addr, ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (uint64_t)(spad_addr), k_MVOUT) + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, dram_addr, ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (uint64_t)(spad_addr), k_MVOUT) 
#define gemmini_mvout(dram_addr, spad_addr) \ gemmini_extended_mvout(dram_addr, spad_addr, DIM, DIM) // compute #define gemmini_extended_compute_preloaded(A, BD, A_cols, A_rows, BD_cols, BD_rows) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_PRELOADED) + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_PRELOADED) #define gemmini_extended_compute_accumulated(A, BD, A_cols, A_rows, BD_cols, BD_rows) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_ACCUMULATE) + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_ACCUMULATE) #define gemmini_compute_preloaded(A, BD) \ gemmini_extended_compute_preloaded(A, BD, DIM, DIM, DIM, DIM) @@ -216,7 +263,7 @@ acc_scale_t_bits acc_scale_t_to_acc_scale_t_bits(acc_scale_t x) { // preload #define gemmini_extended_preload(BD, C, BD_cols, BD_rows, C_cols, C_rows) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD) + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | 
((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD) #define gemmini_preload(BD, C) \ gemmini_extended_preload(BD, C, DIM, DIM, DIM, DIM) @@ -226,19 +273,19 @@ acc_scale_t_bits acc_scale_t_to_acc_scale_t_bits(acc_scale_t x) { // weight-stationary matmul loop #define gemmini_loop_ws(I, J, K, pad_I, pad_J, pad_K, A, B, D, C, A_stride, B_stride, D_stride, C_stride, A_transpose, B_transpose, full_C, low_D, ex_accumulate) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(pad_K) << 32) | ((uint64_t)(pad_J) << 16) | (uint64_t)(pad_I), ((uint64_t)(K) << 32) | ((uint64_t)(J) << 16) | (uint64_t)(I), k_LOOP_WS_CONFIG_BOUNDS) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, A, B, k_LOOP_WS_CONFIG_ADDRS_AB) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, D, C, k_LOOP_WS_CONFIG_ADDRS_DC) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, A_stride, B_stride, k_LOOP_WS_CONFIG_STRIDES_AB) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, D_stride, C_stride, k_LOOP_WS_CONFIG_STRIDES_DC) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((low_D) << 2) | ((full_C) << 1) | (ex_accumulate), ((B_transpose) << 1) | (A_transpose), k_LOOP_WS) + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(pad_K) << 32) | ((uint64_t)(pad_J) << 16) | (uint64_t)(pad_I), ((uint64_t)(K) << 32) | ((uint64_t)(J) << 16) | (uint64_t)(I), k_LOOP_WS_CONFIG_BOUNDS) \ + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, A, B, k_LOOP_WS_CONFIG_ADDRS_AB) \ + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, D, C, k_LOOP_WS_CONFIG_ADDRS_DC) \ + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, A_stride, B_stride, k_LOOP_WS_CONFIG_STRIDES_AB) \ + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, D_stride, C_stride, k_LOOP_WS_CONFIG_STRIDES_DC) \ + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, ((low_D) << 2) | ((full_C) << 1) | (ex_accumulate), ((B_transpose) << 1) | (A_transpose), k_LOOP_WS) // config #define gemmini_extended2_config_ex(dataflow, act, sys_shift, acc_scale, relu6_shift, A_stride, A_transpose, B_transpose, ocol, row_turn, kdim, stride, channel, row_left, kdim2, 
weight_double_bank, weight_triple_bank) \ { \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)acc_scale_t_to_acc_scale_t_bits((acc_scale_t)acc_scale) << 32) | ((uint64_t)(A_stride) << 16) | (B_transpose << 9) | (A_transpose << 8) | ((act) << 3) | ((dataflow) << 2) | CONFIG_EX, ((uint64_t)(relu6_shift) << 32) | (sys_shift), k_CONFIG); \ + ROCC_INSTRUCTION_RD_RS1_RS2_labeled(XCUSTOM_ACC, ((uint64_t)acc_scale_t_to_acc_scale_t_bits((acc_scale_t)acc_scale) << 32) | ((uint64_t)(A_stride) << 16) | (B_transpose << 9) | (A_transpose << 8) | ((act) << 3) | ((dataflow) << 2) | CONFIG_EX, ((uint64_t)(relu6_shift) << 32) | (sys_shift), k_CONFIG, 0); \ \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(weight_triple_bank) << 59) | ((uint64_t)(weight_double_bank) << 58) | ((uint64_t)(row_left) << 54) | ((uint64_t)(row_turn) << 42) | CONFIG_IM2COL, ((uint64_t)ocol << 56) | ((uint64_t)kdim2 << 48) | ((uint64_t)kdim << 44) | ((uint64_t)channel << 23) | ((uint64_t)stride << 20), k_CONFIG) \ + ROCC_INSTRUCTION_RD_RS1_RS2_labeled(XCUSTOM_ACC, ((uint64_t)(weight_triple_bank) << 59) | ((uint64_t)(weight_double_bank) << 58) | ((uint64_t)(row_left) << 54) | ((uint64_t)(row_turn) << 42) | CONFIG_IM2COL, ((uint64_t)ocol << 56) | ((uint64_t)kdim2 << 48) | ((uint64_t)kdim << 44) | ((uint64_t)channel << 23) | ((uint64_t)stride << 20), k_CONFIG, 1) \ } #define gemmini_extended_config_ex(dataflow, act, sys_shift, acc_scale, relu6_shift, A_stride, A_transpose, B_transpose) \ @@ -249,10 +296,10 @@ acc_scale_t_bits acc_scale_t_to_acc_scale_t_bits(acc_scale_t x) { #if defined(HAS_MVIN_SCALE) || defined(HAS_MVIN_ACC_SCALE) #define gemmini_extended3_config_ld(stride, scale, shrunk, id) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(scale_t_to_scale_t_bits(scale)) << 32) | ((id) << 3) | ((shrunk) << 2) | CONFIG_LD, stride, k_CONFIG) + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(scale_t_to_scale_t_bits(scale)) << 32) | ((id) << 3) | ((shrunk) << 2) | CONFIG_LD, stride, k_CONFIG) 
#else #define gemmini_extended2_config_ld(stride, scale, shrunk, id) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((id) << 3) | ((shrunk) << 2) | CONFIG_LD, stride, k_CONFIG) + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, ((id) << 3) | ((shrunk) << 2) | CONFIG_LD, stride, k_CONFIG) #endif #define gemmini_extended2_config_ld(stride, scale, shrunk) \ @@ -265,17 +312,18 @@ acc_scale_t_bits acc_scale_t_to_acc_scale_t_bits(acc_scale_t x) { gemmini_extended_config_ld(stride, MVIN_SCALE_IDENTITY) #define gemmini_extended_config_st(stride, pool_stride, pool_size, pool_out_dim, porows, pocols, orows, ocols, upad, lpad) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(ocols) << 56) | ((uint64_t)(orows) << 48) | ((uint64_t)(pocols) << 40) | ((uint64_t)(porows) << 32) | ((uint64_t)(pool_out_dim) << 24) | ((uint64_t)(lpad) << 10) | ((uint64_t)(upad) << 8) | ((uint64_t)(pool_size) << 6) | ((uint64_t)(pool_stride) << 4) | CONFIG_ST, stride, k_CONFIG) + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(ocols) << 56) | ((uint64_t)(orows) << 48) | ((uint64_t)(pocols) << 40) | ((uint64_t)(porows) << 32) | ((uint64_t)(pool_out_dim) << 24) | ((uint64_t)(lpad) << 10) | ((uint64_t)(upad) << 8) | ((uint64_t)(pool_size) << 6) | ((uint64_t)(pool_stride) << 4) | CONFIG_ST, stride, k_CONFIG) #define gemmini_config_st(stride) \ gemmini_extended_config_st(stride, 0, 0, 0, 0, 0, 0, 0, 0, 0) -// flush -#define gemmini_flush(skip) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, skip, 0, k_FLUSH) - // fence -#define gemmini_fence() asm volatile("fence") +// First, spin until no instructions are busy in gemmini, with k_FENCE +// Then, use the generic fence +#define gemmini_fence() { \ + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, 0, 0, k_FENCE); \ + asm volatile("fence"); \ + } // Tiling functions static void sp_tiled_matmul_os(const elem_t * A, const elem_t * B, const void * D, void * C, @@ -1147,7 +1195,7 @@ void sp_tiled_conv( // Compute // printf("compute\n"); - // if (icols_per_systolic_row == 1 || + 
// if (icols_per_systolic_row == 1 || for (int b = 0; b < batches; b++) for (int orow = 0; orow < orows; orow++) for (int ocol = 0; ocol < ocols; ocol += DIM) { @@ -1278,7 +1326,7 @@ void sp_tiled_conv_ds( // Calculate image dimensions const int irows = (orows - 1) * stride + 1; - const int icols = (ocols - 1) * stride + 1;//kcols; + const int icols = (ocols - 1) * stride + 1;//kcols; const int ichs = kchs; const int im2col_height = ocols*orows; @@ -1287,9 +1335,9 @@ void sp_tiled_conv_ds( const int row_turn = row_left == 0 ? im2col_height/DIM - 1 : im2col_height/DIM; const int double_bank = weight_bank > 1 ? 1 : 0; const int triple_bank = weight_bank > 2 ? 1 : 0; - + int odims = im2col_height; - + gemmini_extended2_config_ex(WEIGHT_STATIONARY, act, 0, scale, relu6_shift, 1, false, false, ocols, row_turn, 1, stride, kchs, row_left, 1, double_bank, triple_bank); //if want 2 banks for weight, last is 1 const uint32_t A_sp_addr_start = 0; @@ -1322,7 +1370,7 @@ void sp_tiled_conv_ds( gemmini_config_ld(out_channels*sizeof(elem_t)); for (int och = 0; och < ochs; och += DIM) { const int J = ochs - och > DIM ? DIM : ochs - och; - const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kchs; + const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kchs; for (int kch = 0; kch < kchs; kch += DIM) { const int K = kchs - kch > DIM ? 
DIM : kchs - kch; gemmini_extended_mvin(weights + kch * out_channels + och, @@ -1333,7 +1381,7 @@ void sp_tiled_conv_ds( } // gemmini_fence(); - int idims = irows*icols; + int idims = irows*icols; int bidims = batches*idims; // mvin input // printf("mvin inputs\n"); @@ -1359,15 +1407,15 @@ int bidims = batches*idims; if(odims > DIM){ //output dimension (row*col) bigger than DIM for (int b = 0; b < batches; b++){ for (int och = 0; och < ochs; och += DIM) { - const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kchs;// + kch; + const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kchs;// + kch; const uint32_t C_sp_addr = C_sp_addr_start + (och / DIM) * batches * odims + b * odims; for (int kch = 0; kch < kchs; kch += DIM) { // gemmini_extended_mvin(weights + kch * out_channels + och, // B_sp_addr+kch, // DIM, DIM); - - const uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM)*batches*idims + b*idims; + + const uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM)*batches*idims + b*idims; for(int odim = 0; odim < odims; odim += DIM){ //both dimension at the same time const int I = odims - odim > DIM ? 
DIM : odims - odim; gemmini_extended_preload(B_sp_addr+kch, C_sp_addr+odim, @@ -1376,7 +1424,7 @@ int bidims = batches*idims; } } // if(output!=NULL) gemmini_extended_mvout(output + (b * out_dim * out_dim)*out_channels + och, C_sp_addr, DIM, 0); - + } } }else{//ds layer @@ -1393,7 +1441,7 @@ int bidims = batches*idims; DIM, DIM); } */ - const uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM)*bidims + b*idims; + const uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM)*bidims + b*idims; gemmini_extended_preload(B_sp_addr, C_sp_addr, DIM, DIM, DIM, odims); gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, DIM, odims, DIM, odims); @@ -1426,7 +1474,7 @@ int bidims = batches*idims; DIM, DIM, DIM, odims); gemmini_extended_compute_preloaded(A_sp_addr+7*bidims, GARBAGE_ADDR, DIM, odims, DIM, odims); kch_bound = kch + 8*DIM; - + } //if kch is not divisible by DIM for (; kch_bound < kchs; kch_bound += DIM) { @@ -1435,16 +1483,16 @@ int bidims = batches*idims; // gemmini_extended_mvin(weights + kch_bound * out_channels + och, // B_sp_addr, // DIM, DIM); - const uint32_t A_sp_addr = A_sp_addr_start + (kch_bound / DIM)*bidims + b*idims; - + const uint32_t A_sp_addr = A_sp_addr_start + (kch_bound / DIM)*bidims + b*idims; + gemmini_extended_preload(B_sp_addr, C_sp_addr, DIM, DIM, DIM, odims); gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, DIM, odims, DIM, odims); - + } // const uint32_t C_sp_addr = C_sp_addr_start + (och / DIM) * batches * odims + b * odims; // if(output!=NULL) gemmini_extended_mvout(output + (b * out_dim * out_dim)*out_channels + och, C_sp_addr, DIM, 0); - + } } } @@ -1457,7 +1505,7 @@ int bidims = batches*idims; for(int och = 0; och < ochs; och += DIM){ const uint32_t C_sp_addr = C_sp_addr_start + (och / DIM) * batches * odims + b * odims; gemmini_extended_mvout(output + (b * out_dim * out_dim)*out_channels + och, C_sp_addr, DIM, 0); - } + } } } @@ -1489,11 +1537,11 @@ void sp_tiled_conv_dw( const int ocols = pocols * pool_stride + 
pool_size - 1 - plpad - prpad; // Calculate image dimensions const int irows = (orows - 1) * stride + kernel_dim; - const int icols = (ocols - 1) * stride + kernel_dim;//kcols; + const int icols = (ocols - 1) * stride + kernel_dim;//kcols; const int irows_unpadded = irows - upad - dpad; const int icols_unpadded = icols - lpad - rpad; int kchs = 1; - int kdims = kernel_dim * kernel_dim; + int kdims = kernel_dim * kernel_dim; int double_bank = 0;//weight_bank > 1 ? 1 : 0; int triple_bank = 0;//weight_bank > 2 ? 1 : 0; @@ -1510,7 +1558,7 @@ void sp_tiled_conv_dw( const uint32_t D_sp_addr_start = 1 << (ADDR_LEN - 1); const uint32_t C_sp_addr_start = 3 << (ADDR_LEN - 2); - if (!no_bias && bias != NULL) { + if (!no_bias && bias != NULL) { gemmini_config_ld(0); for (int b = 0; b < batches; b++){ const int J = 1; @@ -1522,7 +1570,7 @@ void sp_tiled_conv_dw( J, I); } } - } + } if (mvin_weight) { // mvin weights @@ -1530,7 +1578,7 @@ void sp_tiled_conv_dw( gemmini_config_ld(out_channels * sizeof(elem_t)); for (int krow = 0; krow < kernel_dim; krow++){ const uint32_t B_sp_addr = B_sp_addr_start+ krow*kernel_dim; - + for (int kcol = 0; kcol < kernel_dim; kcol++){ gemmini_extended_mvin(weights + (krow*kernel_dim + kcol) * out_channels, B_sp_addr+kcol, @@ -1538,7 +1586,7 @@ void sp_tiled_conv_dw( } } } - + // mvin input // printf("mvin inputs\n"); gemmini_config_ld(in_channels * sizeof(elem_t)); @@ -1553,13 +1601,13 @@ void sp_tiled_conv_dw( int I = icols_unpadded - icol > DIM ? DIM : icols_unpadded - icol; elem_t * in = input + (b*in_dim*in_dim + irow*in_dim + icol) * in_channels;// + ich; - + if (icol < 0) { I = -icol > DIM ? DIM : -icol; } else if (icol >= icols_unpadded) { I = icols_unpadded + rpad - icol > DIM ? 
DIM : icols_unpadded + rpad - icol; } - const bool is_zeros = irow < 0 || irow >= irows_unpadded || icol < 0 || icol >= icols_unpadded; + const bool is_zeros = irow < 0 || irow >= irows_unpadded || icol < 0 || icol >= icols_unpadded; const int icol_padded = icol + lpad; const uint32_t A_sp_addr = A_sp_addr_start + b * idims + irow_padded * icols + icol_padded; if(is_zeros){ @@ -1588,14 +1636,14 @@ void sp_tiled_conv_dw( } } // gemmini_fence(); - + // gemmini_config_ld(0); for (int b = 0; b < batches; b++){ const int J = 1; //const uint32_t D_sp_addr = D_sp_addr_start + b * odims;// + odim; const uint32_t C_sp_addr_outer = C_sp_addr_start + b * odims;// + odim; - - const uint32_t A_sp_addr = A_sp_addr_start + b*idims; + + const uint32_t A_sp_addr = A_sp_addr_start + b*idims; const int kkdims = kdims; const uint32_t B_sp_addr = B_sp_addr_start; const int K = 1; @@ -1607,11 +1655,11 @@ void sp_tiled_conv_dw( // J, I); const uint32_t C_sp_addr = C_sp_addr_outer + odim; - for(int kkdim = 0; kkdim < kkdims; kkdim += K){ + for(int kkdim = 0; kkdim < kkdims; kkdim += K){ gemmini_extended_preload(B_sp_addr + kkdim, C_sp_addr, J, K, J, I); gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, K, I, J, I); - + } // gemmini_extended_mvout(output + (b * out_dim * out_dim)*out_channels, C_sp_addr_outer, J, 0); // } @@ -1631,7 +1679,7 @@ void sp_tiled_conv_dw( gemmini_extended_mvout(output + (b*out_dim*out_dim + orow*out_dim + ocol) * out_channels, C_sp_addr, 1, I); - } + } } //for first layer @@ -1666,8 +1714,8 @@ void sp_tiled_conv_first( // Calculate image dimensions const int irows = (orows - 1) * stride + krows; - const int icols = (ocols - 1) * stride + 1;//krows; - int kdims = krows*krows; + const int icols = (ocols - 1) * stride + 1;//krows; + int kdims = krows*krows; const int ichs = kchs*krows; //pack rows (kchs: normal channel number) int double_bank = weight_bank > 1 ? 
1 : 0; @@ -1719,7 +1767,7 @@ void sp_tiled_conv_first( J, K); } } - } + } } // mvin input // printf("mvin inputs\n"); @@ -1729,7 +1777,7 @@ void sp_tiled_conv_first( for (int icol = 0; icol < icols;) { int I = icols - icol > DIM ? DIM : icols- icol; elem_t * in = input + (b*in_dim*in_dim + irow*in_dim + icol) * in_channels;// + ich; - + const uint32_t A_sp_addr = A_sp_addr_start + b * idims + irow * icols + icol; for (int ich = 0; ich < ichs; ich += DIM) { const int K = ichs - ich > DIM ? DIM : ichs - ich; @@ -1750,13 +1798,13 @@ void sp_tiled_conv_first( const uint32_t C_sp_addr = C_sp_addr_start + (och / DIM) * batches * odims + b * odims;// + odim; for (int kch = 0; kch < ichs; kch += DIM) { //treat as 3x7=21 channels const int K = ichs - kch > DIM ? DIM : ichs - kch; - const uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM)*batches*idims + b*idims; + const uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM)*batches*idims + b*idims; const int kkdims = K*krows;//kdims; const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kchs * kdims + kch*krows; for(int odim = 0; odim < odims; odim += DIM){ //both dimension at the same time const int I = odims - odim > DIM ? 
DIM : odims - odim; - for(int kkdim = 0; kkdim < kkdims; kkdim += K){ + for(int kkdim = 0; kkdim < kkdims; kkdim += K){ gemmini_extended_preload(B_sp_addr + kkdim, C_sp_addr+odim, J, K, J, I); gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, K, I, J, I); @@ -1771,7 +1819,7 @@ void sp_tiled_conv_first( C_sp_addr, J, 0); } -*/ +*/ } } @@ -1823,11 +1871,11 @@ void sp_tiled_conv_ws_original( // Calculate image dimensions const int irows = (orows - 1) * stride + krows; - const int icols = (ocols - 1) * stride + krows;//kcols; + const int icols = (ocols - 1) * stride + krows;//kcols; const int irows_unpadded = irows - upad - dpad; const int icols_unpadded = icols - lpad - rpad; const int ichs = kchs; - int kdims = krows*krows; + int kdims = krows*krows; int idims = irows*icols; int bidims = batches*irows*icols; @@ -1873,7 +1921,7 @@ void sp_tiled_conv_ws_original( const int K = kchs - kch > DIM ? DIM : kchs - kch; for (int krow = 0; krow < krows; krow++){ const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kdims * kchs + kch*kdims + krow*krows*K; - + for (int kcol = 0; kcol < krows; kcol++){ gemmini_extended_mvin(weights + (krow*kernel_dim*in_channels + kcol*in_channels + kch) * out_channels + och, B_sp_addr+kcol*K, @@ -1885,7 +1933,7 @@ void sp_tiled_conv_ws_original( // mvin input // printf("mvin inputs\n"); gemmini_config_ld(in_channels * sizeof(elem_t)); - static elem_t zeros[MAX_BYTES / sizeof(elem_t)] = {0}; + static elem_t zeros[MAX_BYTES / sizeof(elem_t)] = {0}; // gemmini_fence(); // TODO fix ROB to get rid of this requirement for (int b = 0; b < batches; b++) { for (int irow = -upad; irow < irows_unpadded + dpad; irow++) { @@ -1894,17 +1942,17 @@ void sp_tiled_conv_ws_original( for (int icol = -lpad; icol < icols_unpadded + rpad;) { int I = icols_unpadded - icol > DIM ? DIM : icols_unpadded - icol; elem_t * in = input + (b*in_dim*in_dim + irow*in_dim + icol) * in_channels;// + ich; - + if (icol < 0) { I = -icol > DIM ? 
DIM : -icol; } else if (icol >= icols_unpadded) { I = icols_unpadded + rpad - icol > DIM ? DIM : icols_unpadded + rpad - icol; } - const bool is_zeros = irow < 0 || irow >= irows_unpadded || icol < 0 || icol >= icols_unpadded; + const bool is_zeros = irow < 0 || irow >= irows_unpadded || icol < 0 || icol >= icols_unpadded; const int icol_padded = icol + lpad; const uint32_t A_sp_addr = A_sp_addr_start + b * idims + irow_padded * icols + icol_padded; if(is_zeros){ - gemmini_config_ld(0); + gemmini_config_ld(0); for (int ich = 0; ich < ichs; ich += DIM) { const int K = ichs - ich > DIM ? DIM : ichs - ich; in = &zeros[0]; @@ -1932,11 +1980,11 @@ void sp_tiled_conv_ws_original( const int J = ochs - och > DIM ? DIM : ochs - och; for (int kch = 0; kch < kchs; kch += DIM) { const int K = kchs - kch > DIM ? DIM : kchs - kch; - const uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM)*batches*idims + b*idims; - const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kdims * kchs + kch*kdims; + const uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM)*batches*idims + b*idims; + const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kdims * kchs + kch*kdims; for(int odim = 0; odim < odims; odim += DIM){ //both dimension at the same time const int I = odims - odim > DIM ? 
DIM : odims - odim; - const int C_sp_addr = C_sp_addr_start + (och / DIM) * batches * odims + b * odims + odim; + const int C_sp_addr = C_sp_addr_start + (och / DIM) * batches * odims + b * odims + odim; for(int kkdim = 0; kkdim < K*kdims; kkdim += K){ gemmini_extended_preload(B_sp_addr+kkdim, C_sp_addr, J, K, J, I); @@ -1963,7 +2011,7 @@ void sp_tiled_conv_ws_original( C_sp_addr, J, I); } - + } } else { @@ -2017,11 +2065,11 @@ void sp_tiled_conv_ws_original_first( // Calculate image dimensions const int irows = (orows - 1) * stride + krows; - const int icols = (ocols - 1) * stride + krows;//kcols; + const int icols = (ocols - 1) * stride + krows;//kcols; const int irows_unpadded = irows - upad - dpad; const int icols_unpadded = icols - lpad - rpad; const int ichs = kchs; - int kdims = krows*krows; + int kdims = krows*krows; int idims = irows*icols; int bidims = batches*irows*icols; @@ -2058,7 +2106,7 @@ int bidims = batches*irows*icols; // mvin input // printf("mvin inputs\n"); gemmini_config_ld(in_channels * sizeof(elem_t)); - static elem_t zeros[MAX_BYTES / sizeof(elem_t)] = {0}; + static elem_t zeros[MAX_BYTES / sizeof(elem_t)] = {0}; for (int b = 0; b < batches; b++) { for (int irow = -upad; irow < irows_unpadded + dpad; irow++) { const int irow_padded = irow + upad; @@ -2066,17 +2114,17 @@ int bidims = batches*irows*icols; for (int icol = -lpad; icol < icols_unpadded + rpad;) { int I = icols_unpadded - icol > DIM ? DIM : icols_unpadded - icol; elem_t * in = input + (b*in_dim*in_dim + irow*in_dim + icol) * in_channels;// + ich; - + if (icol < 0) { I = -icol > DIM ? DIM : -icol; } else if (icol >= icols_unpadded) { I = icols_unpadded + rpad - icol > DIM ? 
DIM : icols_unpadded + rpad - icol; } - const bool is_zeros = irow < 0 || irow >= irows_unpadded || icol < 0 || icol >= icols_unpadded; + const bool is_zeros = irow < 0 || irow >= irows_unpadded || icol < 0 || icol >= icols_unpadded; const int icol_padded = icol + lpad; const uint32_t A_sp_addr = A_sp_addr_start + b * idims + irow_padded * icols + icol_padded; if(is_zeros){ - gemmini_config_ld(0); + gemmini_config_ld(0); in = &zeros[0]; gemmini_extended_mvin(in, A_sp_addr, @@ -2101,14 +2149,14 @@ int bidims = batches*irows*icols; const int K = kchs;//kchs - kch > DIM ? DIM : kchs - kch; for (int krow = 0; krow < krows; krow++){ const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kdims * kchs + krow*krows*K; - + for (int kcol = 0; kcol < krows; kcol++){ gemmini_extended_mvin(weights + (krow*kernel_dim*in_channels + kcol*in_channels) * out_channels + och, B_sp_addr+kcol*K, J, K); } } - + } } @@ -2117,10 +2165,10 @@ int bidims = batches*irows*icols; for (int b = 0; b < batches; b++){ for (int och = 0; och < ochs; och += DIM) { const int J = ochs - och > DIM ? DIM : ochs - och; - const uint32_t C_sp_addr = C_sp_addr_start + (och / DIM) * batches * odims + b * odims;// + odim; + const uint32_t C_sp_addr = C_sp_addr_start + (och / DIM) * batches * odims + b * odims;// + odim; const int K = kchs;// - kch > DIM ? DIM : kchs - kch; - const uint32_t A_sp_addr = A_sp_addr_start + b*idims; - const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kdims * kchs;// + kch*kdims; + const uint32_t A_sp_addr = A_sp_addr_start + b*idims; + const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kdims * kchs;// + kch*kdims; for(int odim = 0; odim < odims; odim += DIM){ //both dimension at the same time const int I = odims - odim > DIM ? 
DIM : odims - odim; for(int kkdim = 0; kkdim < K*kdims; kkdim += K){ @@ -2129,13 +2177,13 @@ int bidims = batches*irows*icols; gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, K, I, J, I); } } -/* - //attempt to merge matmul and mvout +/* + //attempt to merge matmul and mvout elem_t * pout = output + (b * pool_out_dim * pool_out_dim)*out_channels + och; gemmini_extended_mvout(pout, C_sp_addr, J, 0); -*/ +*/ } } @@ -2576,9 +2624,9 @@ void tiled_conv_first( //for (int kch = 0; kch < in_channels; kch += kchs) { elem_t * out = output + (b*pool_out_dim*pool_out_dim + porow*pool_out_dim + pocol) * out_channels + poch; - + acc_t * bias_ = bias + poch; - + const int batches_ = batch_size - b > batches ? batches : batch_size - b; const int porows_ = pool_out_dim - porow > porows ? porows : pool_out_dim - porow; const int pocols_ = pool_out_dim - pocol > pocols ? pocols : pool_out_dim - pocol; @@ -2616,7 +2664,7 @@ void tiled_conv_first( lpad, rpad, upad, dpad, plpad, prpad, pupad, pdpad, - input + (b*in_dim*in_dim + (irow+upad)*in_dim + (icol+lpad)) * in_channels,// + kch, + input + (b*in_dim*in_dim + (irow+upad)*in_dim + (icol+lpad)) * in_channels,// + kch, weights + poch, //weights + (krow*kernel_dim*in_channels + kcol*in_channels + kch) * out_channels + poch, out, @@ -2625,7 +2673,7 @@ void tiled_conv_first( act, scale, relu6_shift, no_bias, no_pool, mvin_weight, weight_bank); - + } } } @@ -2647,9 +2695,9 @@ void tiled_conv_first( const int ocol_floored = ocol < 0 ? 0 : ocol; const int icol = ocol_floored * stride - padding; //+ kcol - padding; elem_t * out = output + (b*pool_out_dim*pool_out_dim + porow*pool_out_dim + pocol) * out_channels + poch; - + acc_t * bias_ = bias + poch; - + const int batches_ = batch_size - b > batches ? batches : batch_size - b; const int porows_ = pool_out_dim - porow > porows ? 
porows : pool_out_dim - porow; @@ -2730,11 +2778,11 @@ void sp_tiled_conv_ws( // Calculate image dimensions const int irows = (orows - 1) * stride + krows; - const int icols = (ocols - 1) * stride + krows;//kcols; + const int icols = (ocols - 1) * stride + krows;//kcols; const int irows_unpadded = irows - upad - dpad; const int icols_unpadded = icols - lpad - rpad; const int ichs = kchs; - int kdims = krows*krows; + int kdims = krows*krows; int double_bank = weight_bank > 1 ? 1 : 0; int triple_bank = weight_bank > 2 ? 1 : 0; @@ -2789,13 +2837,13 @@ void sp_tiled_conv_ws( for (int icol = -lpad; icol < icols_unpadded + rpad;) { int I = icols_unpadded - icol > DIM ? DIM : icols_unpadded - icol; elem_t * in = input + (b*in_dim*in_dim + irow*in_dim + icol) * in_channels;// + ich; - + if (icol < 0) { I = -icol > DIM ? DIM : -icol; } else if (icol >= icols_unpadded) { I = icols_unpadded + rpad - icol > DIM ? DIM : icols_unpadded + rpad - icol; } - const bool is_zeros = irow < 0 || irow >= irows_unpadded || icol < 0 || icol >= icols_unpadded; + const bool is_zeros = irow < 0 || irow >= irows_unpadded || icol < 0 || icol >= icols_unpadded; const int icol_padded = icol + lpad; const uint32_t A_sp_addr = A_sp_addr_start + b * idims + irow_padded * icols + icol_padded; if(is_zeros){ @@ -2832,15 +2880,15 @@ void sp_tiled_conv_ws( for (int kch = 0; kch < kchs; kch += DIM) { const int K = kchs - kch > DIM ? DIM : kchs - kch; - const uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM)*batches*idims + b*idims; + const uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM)*batches*idims + b*idims; const int kkdims = K*kdims; const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kdims * kchs + kch*kdims; - + for(int odim = 0; odim < odims; odim += DIM){ //both dimension at the same time const int I = odims - odim > DIM ? 
DIM : odims - odim; const uint32_t C_sp_addr = C_sp_addr_outer + odim; - for(int kkdim = 0; kkdim < kkdims; kkdim += K){ + for(int kkdim = 0; kkdim < kkdims; kkdim += K){ gemmini_extended_preload(B_sp_addr + kkdim, C_sp_addr, J, K, J, I); gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, K, I, J, I); @@ -2867,7 +2915,7 @@ void sp_tiled_conv_ws( C_sp_addr, J, I); } - + } } else { @@ -2897,10 +2945,10 @@ void sp_tiled_conv_ws( for (int kch = 0; kch < kchs; kch += DIM) { const int K = kchs - kch > DIM ? DIM : kchs - kch; - const uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM)*batches*idims + b*idims; + const uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM)*batches*idims + b*idims; const int kkdims = K*kdims; const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kdims * kchs + kch*kdims; - + for(int odim = 0; odim < odims; odim += DIM){ //both dimension at the same time const int I = odims - odim > DIM ? DIM : odims - odim; if(kch == 0) @@ -2909,7 +2957,7 @@ void sp_tiled_conv_ws( J, I); const uint32_t C_sp_addr = C_sp_addr_outer + odim; - for(int kkdim = 0; kkdim < kkdims; kkdim += K){ + for(int kkdim = 0; kkdim < kkdims; kkdim += K){ gemmini_extended_preload(B_sp_addr + kkdim, C_sp_addr, J, K, J, I); gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, K, I, J, I); @@ -2972,7 +3020,7 @@ void sp_tiled_conv_ws( const int J = ochs - och > DIM ? 
DIM : ochs - och; const uint32_t C_sp_addr = C_sp_addr_start + (och / DIM) * batches * odims + b * odims; gemmini_extended_mvout(output + (b * out_dim * out_dim)*out_channels + och, C_sp_addr, J, 0); - } + } //} */ @@ -3141,7 +3189,7 @@ void tiled_conv_original( act, scale, relu6_shift, no_bias, no_pool, weight_bank); - + else //downsampling layer sp_tiled_conv_ds( batch_size, in_dim, in_channels, @@ -3157,14 +3205,14 @@ void tiled_conv_original( plpad, prpad, pupad, pdpad, input + (b*in_dim*in_dim + (irow+upad)*in_dim + (icol+lpad)) * in_channels + kch, - 0, + 0, weights + kch * out_channels + poch, //weights + (krow*kernel_dim*in_channels + kcol*in_channels + kch) * out_channels + poch, out, bias_, act, scale, relu6_shift, - no_bias, no_pool, + no_bias, no_pool, weight_bank); } } @@ -3254,7 +3302,7 @@ void tiled_conv( #endif int kdims = kcols*kcols; const uint32_t B_sp_addr_start = (BANK_NUM - weight_bank) * BANK_ROWS; - + const int pool_out_dim = (out_dim + 2*pool_padding - pool_size) / pool_stride + 1; for (int poch = 0; poch < out_channels; poch += pochs) { @@ -3279,7 +3327,7 @@ void tiled_conv( } } } - } + } for (int b = 0; b < batch_size; b += batches) { for (int porow = 0; porow < pool_out_dim; porow += porows) { const int orow = porow * pool_stride - pool_padding; @@ -3293,7 +3341,7 @@ void tiled_conv( const int irow = orow_floored * stride - padding;//+ krow - padding; const int ocol_floored = ocol < 0 ? 
0 : ocol; const int icol = ocol_floored * stride - padding; //+ kcol - padding; - + const int ocols_ = pocols_ * pool_stride + pool_size - 1; const int orows_ = porows_ * pool_stride + pool_size - 1; @@ -3523,10 +3571,10 @@ void tiled_conv_auto_dw( args[1]--; args[2]--; }else{ - args[0]--; + args[0]--; } - acc_rows = tiled_conv_total_spad_rows(true, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); + acc_rows = tiled_conv_total_spad_rows(true, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); } int batches = args[0]; @@ -3561,7 +3609,7 @@ void tiled_conv_auto_dw( act, scale, relu6_shift, pool_size, no_pool ? 0 : pool_stride, pool_padding, - + tiled_conv_type); } @@ -3579,13 +3627,13 @@ void tiled_conv_auto_original( int act, acc_scale_t scale, size_t relu6_shift, int pool_size, int pool_stride, int pool_padding, - + enum tiled_matmul_type_t tiled_conv_type) { const int weight_bank = in_channels > 500? 
3 : 2; const bool no_pool = pool_stride == 0 || (pool_stride == 1 && pool_size == 1 && pool_padding == 0); const bool no_1d = no_pool; //Todo: change to 1d - + if (no_pool) { pool_size = 1; pool_stride = 1; @@ -3636,7 +3684,7 @@ void tiled_conv_auto_original( } else args[0]--; } - acc_rows = tiled_conv_total_spad_rows(true, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); + acc_rows = tiled_conv_total_spad_rows(true, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); } // printf("batch: %d, out_dim: %d, out_channel: %d, in_channel: %d \n", args[0], args[1], args[3], args[4]); int spad_rows_input = tiled_conv_total_spad_rows(false, false, @@ -3645,7 +3693,7 @@ void tiled_conv_auto_original( while(spad_rows_input > BANK_ROWS*(BANK_NUM-weight_bank)){// tile input last - //batch, input dimension, input channel + //batch, input dimension, input channel int max_val = -1; int max_idx = -1; for(int i = 0; i < 5; i++){ @@ -3665,7 +3713,7 @@ void tiled_conv_auto_original( } else args[max_idx]--; - spad_rows_input = tiled_conv_total_spad_rows(false, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); + spad_rows_input = tiled_conv_total_spad_rows(false, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); } @@ -3766,14 +3814,14 @@ void tiled_conv_auto_largeC( } args[max_idx]--; - acc_rows = tiled_conv_total_spad_rows(true, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); + acc_rows = tiled_conv_total_spad_rows(true, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); } // printf("batch: %d, out_dim: %d, out_channel: %d, in_channel: %d \n", args[0], args[1], args[3], args[4]); int spad_rows_input = tiled_conv_total_spad_rows(false, 
false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); while(spad_rows_input > BANK_ROWS*(BANK_NUM-weight_bank)){// tile input last - //batch, input dimension + //batch, input dimension int max_val = -1; int max_idx = -1; for(int i = 0; i < 3; i++){ @@ -3784,7 +3832,7 @@ void tiled_conv_auto_largeC( } args[max_idx]--; - spad_rows_input = tiled_conv_total_spad_rows(false, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); + spad_rows_input = tiled_conv_total_spad_rows(false, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); } int batches = args[0]; @@ -3850,7 +3898,7 @@ void tiled_conv_auto( int args[] = {batch_size, pool_out_dim, pool_out_dim, out_channels, in_channels}; int och_floor = (args[3]/DIM) + 1; - + int spad_rows_weight = tiled_conv_total_spad_rows(false, true, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); while(spad_rows_weight > BANK_ROWS * weight_bank){ //tile weight first (allocate bank3 to weight) @@ -3860,7 +3908,7 @@ void tiled_conv_auto( spad_rows_weight = tiled_conv_total_spad_rows(false, true, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); } - + int acc_rows = tiled_conv_total_spad_rows(true, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); @@ -3869,7 +3917,7 @@ void tiled_conv_auto( args[1]--; args[2]--; - acc_rows = tiled_conv_total_spad_rows(true, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); + acc_rows = tiled_conv_total_spad_rows(true, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); } // printf("batch: %d, out_dim: %d, out_channel: %d, in_channel: %d \n", args[0], args[1], args[3], 
args[4]); @@ -3905,7 +3953,7 @@ void tiled_conv_auto( act, scale, relu6_shift, pool_size, no_pool ? 0 : pool_stride, pool_padding, - + weight_bank, tiled_conv_type); } @@ -4080,4 +4128,3 @@ void tiled_resadd_auto(const size_t I, const size_t J, #undef abs #endif // SRC_MAIN_C_GEMMINI_H - From 63a8d43ef25493b09d234d4a8d3269ae9d4e64c1 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Thu, 17 Dec 2020 10:14:44 -0800 Subject: [PATCH 2/3] Only use xcpt-handler in debug mode --- include/gemmini.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/include/gemmini.h b/include/gemmini.h index b0e3c3ad..3c7cc98c 100644 --- a/include/gemmini.h +++ b/include/gemmini.h @@ -197,17 +197,26 @@ acc_scale_t_bits acc_scale_t_to_acc_scale_t_bits(acc_scale_t x) { // LSB indicates if the page addr is load fault or store fault void handle_gemmini_xcpt(uint64_t rd) { char is_st = (uint64_t) rd & 1; - char* addr = (char*) rd; + char* addr = (char*) (rd & ~1); if (is_st) { + printf("[GEMMINI] Store page fault at %x\n", addr); *addr = 0; } else { + printf("[GEMMINI] Load page fault at %x\n", addr); volatile char t = *addr; } gemmini_flush(0); } +#ifdef GEMMINI_XCPT_DEBUG #define ROCC_INSTRUCTION_RD_RS1_RS2(x, rs1, rs2, funct) \ ROCC_INSTRUCTION_RD_RS1_RS2_labeled(x, rs1, rs2, funct, 0) +#else +#define ROCC_INSTRUCTION_RD_RS1_RS2(x, rs1, rs2, funct) \ + ROCC_INSTRUCTION_RS1_RS2(x, rs1, rs2, funct) +#endif + + // First, issue the instruction, and then check the return code // If the return code indicates a pending exception from gemmini, handle it, From b67e6cf322a46a2e3e99024453feacc934ad8688 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Thu, 17 Dec 2020 10:18:03 -0800 Subject: [PATCH 3/3] Add comment on GEMMINI_XCPT_DEBUG --- include/gemmini.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/gemmini.h b/include/gemmini.h index 3c7cc98c..d3ecadb0 100644 --- a/include/gemmini.h +++ b/include/gemmini.h @@ -208,6 +208,11 @@ void 
handle_gemmini_xcpt(uint64_t rd) { gemmini_flush(0); } +// When GEMMINI_XCPT_DEBUG is set, software will be compiled +// to check exception status in gemmini. The handle_gemmini_xcpt +// routine will then attempt to resolve any page faults. +// When GEMMINI_XCPT_DEBUG is unset, exception status will not be +// checked, so software must not cause page faults in gemmini. #ifdef GEMMINI_XCPT_DEBUG #define ROCC_INSTRUCTION_RD_RS1_RS2(x, rs1, rs2, funct) \ ROCC_INSTRUCTION_RD_RS1_RS2_labeled(x, rs1, rs2, funct, 0)