From d3d6e689001cf43931110aed86498de6c0ef14c6 Mon Sep 17 00:00:00 2001 From: benjamin Date: Wed, 5 Jun 2024 20:01:21 +0300 Subject: [PATCH] removed SIMD for MSVC --- CMakeLists.txt | 8 +- gbrl/src/cpp/CMakeLists.txt | 3 +- gbrl/src/cpp/fitter.cpp | 8 +- gbrl/src/cpp/loss.cpp | 8 +- gbrl/src/cpp/math_ops.cpp | 192 ++++++++++++++++----- gbrl/src/cpp/math_ops.h | 6 + gbrl/src/cpp/node.cpp | 48 +++++- gbrl/src/cpp/optimizer.cpp | 4 + gbrl/src/cpp/predictor.cpp | 8 +- gbrl/src/cpp/split_candidate_generator.cpp | 15 +- 10 files changed, 233 insertions(+), 67 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ae6c4a9..8d72b4f 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -125,11 +125,6 @@ if(APPLE) find_package(OpenMP REQUIRED) endif() else() - # Set OpenMP runtime for MSVC - if (MSVC) - set(OpenMP_RUNTIME_MSVC experimental) - message(STATUS "OpenMP_RUNTIME_MSVC set to ${OpenMP_RUNTIME_MSVC}") - endif() find_package(OpenMP REQUIRED) set(OpenMP_CXX_FLAGS ${OpenMP_CXX_FLAGS}) set(OpenMP_CXX_LIB_NAMES ${OpenMP_CXX_LIB_NAMES}) @@ -197,7 +192,8 @@ elseif (WIN32) target_link_libraries(gbrl_cpp PRIVATE OpenMP::OpenMP_CXX) if (USE_CUDA) set(cuda_lib_path "${CUDAToolkit_ROOT_DIR}/lib/x64") - target_link_libraries(gbrl_cpp PRIVATE ${cuda_lib_path}/cudart.lib) + # target_link_libraries(gbrl_cpp PRIVATE ${cuda_lib_path}/cudart.lib) + target_link_libraries(gbrl_cpp PRIVATE CUDA::cudart) endif() endif() diff --git a/gbrl/src/cpp/CMakeLists.txt b/gbrl/src/cpp/CMakeLists.txt index e869e0f..487e6fb 100755 --- a/gbrl/src/cpp/CMakeLists.txt +++ b/gbrl/src/cpp/CMakeLists.txt @@ -33,8 +33,7 @@ if (CMAKE_SYSTEM_NAME STREQUAL "Linux") elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -std=c++14 ${OpenMP_C_FLAGS} -Wall -Wpedantic -Wextra") elseif (WIN32) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /std:c++14 ${OpenMP_CXX_FLAGS} /O2 /W3 ") - message(STATUS ${OpenMP_CXX_FLAGS} ) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /std:c++14 ${OpenMP_CXX_FLAGS} /W3") endif() if (CMAKE_BUILD_TYPE STREQUAL "Debug") diff --git a/gbrl/src/cpp/fitter.cpp b/gbrl/src/cpp/fitter.cpp index 136c087..6a355ce 100755 --- a/gbrl/src/cpp/fitter.cpp +++ b/gbrl/src/cpp/fitter.cpp @@ -177,7 +177,9 @@ float Fitter::fit_cpu(dataSet *dataset, const float* targets, ensembleData *edat if (metadata->split_score_func == L2){ float *mean_grads = calculate_mean(build_grads, batch_n_samples, output_dim, par_th); float *std = calculate_var_and_center(build_grads, mean_grads, batch_n_samples, output_dim, par_th); - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = 0; i < output_dim; ++i) std[i] = sqrtf(std[i]); divide_mat_by_vec_inplace(build_grads, std, batch_dataset.n_samples, metadata->output_dim, metadata->par_th); @@ -527,7 +529,9 @@ void Fitter::calc_leaf_value(dataSet *dataset, ensembleData *edata, ensembleMeta } if (passed){ idx = i*output_dim; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < output_dim; ++d) edata->values[leaf_idx*output_dim + d] += grads[idx + d]; count += 1; diff --git a/gbrl/src/cpp/loss.cpp b/gbrl/src/cpp/loss.cpp index 19bbde0..a554353 100755 --- a/gbrl/src/cpp/loss.cpp +++ b/gbrl/src/cpp/loss.cpp @@ -25,7 +25,9 @@ float MultiRMSE::get_loss_and_gradients(const float *raw_preds, const float *raw int thread_id = omp_get_thread_num(); int start_idx = thread_id * elements_per_thread; int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = start_idx; i < end_idx; ++i){ row = i / output_dim; col = i % output_dim; @@ -55,7 +57,9 @@ float MultiRMSE::get_loss(const float *raw_preds, const float *raw_targets, cons int end_idx = (thread_id == n_threads - 1) ? n_samples : start_idx + samples_per_thread; for (int sample_idx = start_idx; sample_idx < end_idx; ++sample_idx){ row = sample_idx * output_dim; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < output_dim; ++d){ grad_value = raw_preds[row + d] - raw_targets[row + d]; losses[thread_id] += (grad_value * grad_value); diff --git a/gbrl/src/cpp/math_ops.cpp b/gbrl/src/cpp/math_ops.cpp index 7962e6b..550ea13 100755 --- a/gbrl/src/cpp/math_ops.cpp +++ b/gbrl/src/cpp/math_ops.cpp @@ -25,14 +25,18 @@ void add_vec_to_mat(float *mat, const float *vec, const int n_samples, const int int thread_id = omp_get_thread_num(); int start_idx = thread_id * elements_per_thread; int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = start_idx; i < end_idx; ++i) { int col = i % n_cols; mat[i] += vec[col]; } } } else { - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = 0; i < n_elements; ++i) { int col = i % n_cols; mat[i] += vec[col]; @@ -51,14 +55,18 @@ void multiply_mat_by_vec_subtract_result(float *result, const float *mat, const int thread_id = omp_get_thread_num(); int start_idx = thread_id * elements_per_thread; int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = start_idx; i < end_idx; ++i) { int col = i % n_cols; result[i] -= (mat[i]*vec[col]); } } } else { - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = 0; i < n_elements; ++i) { int col = i % n_cols; result[i] -= (mat[i]*vec[col]); @@ -78,14 +86,18 @@ void divide_mat_by_vec_inplace(float *mat, const float *vec, const int n_samples int thread_id = omp_get_thread_num(); int start_idx = thread_id * elements_per_thread; int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = start_idx; i < end_idx; ++i) { int col = i % n_cols; mat[i] /= (vec[col] + 1e-8f); } } } else { - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = 0; i < n_elements; ++i) { int col = i % n_cols; mat[i] /= (vec[col] + 1e-8f); @@ -104,14 +116,18 @@ void subtract_vec_from_mat(float *mat, float *vec, const int n_samples, const in int thread_id = omp_get_thread_num(); int start_idx = thread_id * elements_per_thread; int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = start_idx; i < end_idx; ++i) { int col = i % n_cols; mat[i] -= vec[col]; } } } else { - #pragma omp simd + #ifndef _MSC_VER + #pragma omp simd +#endif for (int i = 0; i < n_elements; ++i) { int col = i % n_cols; mat[i] -= vec[col]; @@ -131,13 +147,17 @@ void multiply_mat_by_scalar(float *mat, float scalar, const int n_samples, const int thread_id = omp_get_thread_num(); int start_idx = thread_id * elements_per_thread; int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = start_idx; i < end_idx; ++i) { mat[i] *= scalar; } } } else { - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = 0; i < n_elements; ++i) { mat[i] *= scalar; } @@ -148,7 +168,9 @@ float* calculate_mean(const float *mat, const int n_samples, const int n_cols, c int n_elements = n_samples * n_cols; float *mean = new float[n_cols]; float n_samples_recip = 1.0f / static_cast(n_samples); +#ifndef _MSC_VER #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d) mean[d] = 0.0f; int n_threads = calculate_num_threads(n_elements, par_th); @@ -156,7 +178,9 @@ float* calculate_mean(const float *mat, const int n_samples, const int n_cols, c omp_set_num_threads(n_threads); int elements_per_thread = (n_elements) / n_threads; float *thread_mean = new float[n_threads*n_cols]; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < n_threads*n_cols; ++d) thread_mean[d] = 0.0f; #pragma omp parallel @@ -164,27 +188,35 @@ float* calculate_mean(const float *mat, const int n_samples, const int n_cols, c int thread_id = omp_get_thread_num(); int start_idx = thread_id * elements_per_thread; int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = start_idx; i < end_idx; ++i) { int col = i % n_cols; thread_mean[thread_id * n_cols + col] += mat[i]; } } - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < n_threads*n_cols; ++d) { int col = d % n_cols; mean[col] += thread_mean[d]; } delete[] thread_mean; } else { - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = 0; i < n_elements; ++i) { int col = i % n_cols; mean[col] += mat[i]; } } +#ifndef _MSC_VER #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d) mean[d] *= n_samples_recip; @@ -197,7 +229,9 @@ float* calculate_var(const float *mat, const float *mean, const int n_samples, c float value; float *var = new float[n_cols]; +#ifndef _MSC_VER #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d) { var[d] = 0; } @@ -207,7 +241,9 @@ float* calculate_var(const float *mat, const float *mean, const int n_samples, c int elements_per_thread = n_elements / n_threads; omp_set_num_threads(n_threads); float *thread_var = new float[n_threads*n_cols]; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < n_threads*n_cols; ++d){ thread_var[d] = 0; } @@ -216,21 +252,27 @@ float* calculate_var(const float *mat, const float *mean, const int n_samples, c int thread_id = omp_get_thread_num(); int start_idx = thread_id * elements_per_thread; int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = start_idx; i < end_idx; ++i) { int col = i % n_cols; value = mat[i] - mean[col]; thread_var[thread_id * n_cols + col] += (value * value); } } - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < n_threads*n_cols; ++d) { int col = d % n_cols; var[col] += thread_var[d]; } delete[] thread_var; } else { - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = 0; i < n_elements; ++i) { int col = i % n_cols; value = mat[i] - mean[col]; @@ -238,7 +280,9 @@ float* calculate_var(const float *mat, const float *mean, const int n_samples, c } } +#ifndef _MSC_VER #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d) { var[d] *= n_samples_recip; } @@ -250,7 +294,9 @@ float* calculate_row_covariance(const float *mat_l, const float *mat_r, const in int n_elements = n_samples * n_cols; // assumes both matrices are centered float *cov = new float[n_cols]; +#ifndef _MSC_VER #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d) cov[d] = 0.0f; @@ -261,7 +307,9 @@ float* calculate_row_covariance(const float *mat_l, const float *mat_r, const in int elements_per_thread = n_elements / n_threads; omp_set_num_threads(n_threads); float *thread_cov = new float[n_threads*n_cols]; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < n_threads*n_cols; ++d) thread_cov[d] = 0.0f; #pragma omp parallel @@ -269,27 +317,35 @@ float* calculate_row_covariance(const float *mat_l, const float *mat_r, const in int thread_id = omp_get_thread_num(); int start_idx = thread_id * elements_per_thread; int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = start_idx; i < end_idx; ++i) { int col = i % n_cols; thread_cov[thread_id * n_cols + col] += (mat_l[i] * mat_r[i]); } } - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < n_threads*n_cols; ++d) { int col = d % n_cols; cov[col] += thread_cov[d]; } delete[] thread_cov; } else { - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = 0; i < n_elements; ++i) { int col = i % n_cols; cov[col] += ((mat_l[i] * mat_r[i])); } } +#ifndef _MSC_VER #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d) cov[d] *= n_samples_recip; @@ -302,7 +358,9 @@ float* calculate_var_and_center(float *mat, const float *mean, const int n_sampl float n_samples_recip = 1.0f / (static_cast(n_samples) - 1.0f); float value; +#ifndef _MSC_VER #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d) var[d] = 0; @@ -311,7 +369,9 @@ float* calculate_var_and_center(float *mat, const float *mean, const int n_sampl int elements_per_thread = n_elements / n_threads; omp_set_num_threads(n_threads); float *thread_var = new float[n_threads*n_cols]; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < n_threads*n_cols; ++d) thread_var[d] = 0.0f; #pragma omp parallel @@ -319,7 +379,9 @@ float* calculate_var_and_center(float *mat, const float *mean, const int n_sampl int thread_id = omp_get_thread_num(); int start_idx = thread_id * elements_per_thread; int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = start_idx; i < end_idx; ++i) { int col = i % n_cols; value = mat[i] - mean[col]; @@ -327,14 +389,18 @@ float* calculate_var_and_center(float *mat, const float *mean, const int n_sampl mat[i] -= mean[col]; } } - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < n_threads*n_cols; ++d) { int col = d % n_cols; var[col] += thread_var[d]; } delete[] thread_var; } else { - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = 0; i < n_elements; ++i) { int col = i % n_cols; value = mat[i] - mean[col]; @@ -343,7 +409,9 @@ float* calculate_var_and_center(float *mat, const float *mean, const int n_sampl } } +#ifndef _MSC_VER #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d) var[d] *= n_samples_recip; @@ -356,7 +424,9 @@ float* calculate_std_and_center(float *mat, const float *mean, const int n_sampl float n_samples_recip = 1.0f / (static_cast(n_samples) - 1.0f); float value; +#ifndef _MSC_VER #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d) var[d] = 0; @@ -365,7 +435,9 @@ float* calculate_std_and_center(float *mat, const float *mean, const int n_sampl int elements_per_thread = n_elements / n_threads; omp_set_num_threads(n_threads); float *thread_var = new float[n_threads*n_cols]; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < n_threads*n_cols; ++d) thread_var[d] = 0; #pragma omp parallel @@ -373,7 +445,9 @@ float* calculate_std_and_center(float *mat, const float *mean, const int n_sampl int thread_id = omp_get_thread_num(); int start_idx = thread_id * elements_per_thread; int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = start_idx; i < end_idx; ++i) { int col = i % n_cols; value = mat[i] - mean[col]; @@ -381,14 +455,18 @@ float* calculate_std_and_center(float *mat, const float *mean, const int n_sampl mat[i] -= mean[col]; } } - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < n_threads*n_cols; ++d) { int col = d % n_cols; var[col] += thread_var[d]; } delete[] thread_var; } else { - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = 0; i < n_elements; ++i) { int col = i % n_cols; value = mat[i] - mean[col]; @@ -397,7 +475,9 @@ float* calculate_std_and_center(float *mat, const float *mean, const int n_sampl } } +#ifndef _MSC_VER #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d) var[d] = sqrtf(var[d] * n_samples_recip); @@ -415,7 +495,9 @@ float* copy_mat(const float *mat, const int size, const int par_th){ int thread_id = omp_get_thread_num(); int start_idx = thread_id * elements_per_thread; int end_idx = (thread_id == n_threads - 1) ? size : start_idx + elements_per_thread; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = start_idx; i < end_idx; ++i) copied_mat[i] = mat[i]; } @@ -436,7 +518,9 @@ void _element_wise_addition(float *mat_l, const float *mat_r, const int size, co int thread_id = omp_get_thread_num(); int start_idx = thread_id * elements_per_thread; int end_idx = (thread_id == n_threads - 1) ? size : start_idx + elements_per_thread; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = start_idx; i < end_idx; ++i) mat_l[i] += mat_r[i]; } @@ -457,7 +541,9 @@ float* element_wise_division(const float *mat_l, const float *mat_r, const int s int thread_id = omp_get_thread_num(); int start_idx = thread_id * elements_per_thread; int end_idx = (thread_id == n_threads - 1) ? size : start_idx + elements_per_thread; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = start_idx; i < end_idx; ++i) result[i] = mat_l[i] / (mat_r[i] + + 1e-8f); } @@ -478,12 +564,16 @@ void set_zero_mat(float *mat, const int size, const int par_th){ int thread_id = omp_get_thread_num(); int start_idx = thread_id * elements_per_thread; int end_idx = (thread_id == n_threads - 1) ? size : start_idx + elements_per_thread; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = start_idx; i < end_idx; ++i) mat[i] = 0.0f; } } else { - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = 0; i < size; ++i) mat[i] = 0.0f; } @@ -516,21 +606,27 @@ float* calculate_max(const float *mat, const int n_samples, const int n_cols, co int thread_id = omp_get_thread_num(); int start_idx = thread_id * elements_per_thread; int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread; - #pragma omp simd + #ifndef _MSC_VER + #pragma omp simd +#endif for (int i = start_idx; i < end_idx; ++i) { int col = i % n_cols; thread_max[thread_id * n_cols + col] = mat[i] > thread_max[thread_id * n_cols + col] ? mat[i] : thread_max[thread_id * n_cols + col] ; } } - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < n_threads * n_cols; ++d) { int col = d % n_cols; max[col] = max[col] > thread_max[d] ? max[col] : thread_max[d]; } delete[] thread_max; } else { - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = 0; i < n_elements; ++i) { int col = i % n_cols; max[col] = max[col] > mat[i] ? max[col] : mat[i]; @@ -559,21 +655,27 @@ float* calculate_min(const float *mat, const int n_samples, const int n_cols, co int thread_id = omp_get_thread_num(); int start_idx = thread_id * elements_per_thread; int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = start_idx; i < end_idx; ++i) { int col = i % n_cols; thread_min[thread_id * n_cols + col] = mat[i] < thread_min[thread_id * n_cols + col] ? mat[i] : thread_min[thread_id * n_cols + col] ; } } - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < n_threads * n_cols; ++d) { int col = d % n_cols; min[col] = min[col] < thread_min[d] ? min[col] : thread_min[d]; } delete[] thread_min; } else { - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = 0; i < n_elements; ++i) { int col = i % n_cols; min[col] = min[col] < mat[i] ? min[col] : mat[i]; @@ -594,14 +696,18 @@ void calculate_squared_norm(float *norm, const float *mat, const int n_samples, int thread_id = omp_get_thread_num(); int start_idx = thread_id * elements_per_thread; int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread; - #pragma omp simd + #ifndef _MSC_VER + #pragma omp simd +#endif for (int i = start_idx; i < end_idx; ++i) { int row = i / n_cols; norm[row] += mat[i]*mat[i]; } } } else { - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int i = 0; i < n_elements; ++i) { int row = i / n_cols; norm[row] += mat[i]*mat[i]; diff --git a/gbrl/src/cpp/math_ops.h b/gbrl/src/cpp/math_ops.h index ff86d91..c6ccae4 100755 --- a/gbrl/src/cpp/math_ops.h +++ b/gbrl/src/cpp/math_ops.h @@ -32,7 +32,9 @@ void calculate_squared_norm(float *norm, const float *mat, const int n_samples, inline float mat_vec_dot_sum(const int *indices, const float *grads, const float *vec, const int n_samples, const int n_cols){ float sum = 0.0f; +#ifndef _MSC_VER #pragma omp simd +#endif for (int i = 0; i < n_samples*n_cols; ++i){ int row = i / n_cols; int col = i % n_cols; @@ -43,7 +45,9 @@ inline float mat_vec_dot_sum(const int *indices, const float *grads, const float inline float norm(const float *vec, const int n_samples){ float sum = 0.0f; +#ifndef _MSC_VER #pragma omp simd +#endif for (int n = 0; n < n_samples; ++n){ sum += (vec[n]*vec[n]); } @@ -52,7 +56,9 @@ inline float norm(const float *vec, const int n_samples){ inline float squared_norm(const float *vec, const int n_samples){ float sum = 0.0f; +#ifndef _MSC_VER #pragma omp simd +#endif for (int n = 0; n < n_samples; ++n){ sum += (vec[n]*vec[n]); } diff --git a/gbrl/src/cpp/node.cpp b/gbrl/src/cpp/node.cpp index 93789aa..ea76bc1 100755 --- a/gbrl/src/cpp/node.cpp +++ b/gbrl/src/cpp/node.cpp @@ -185,7 +185,9 @@ float TreeNode::splitScoreCosine(const float *obs, const float *grads, const flo const int *sample_indices = this->sample_indices; float *left_mean = new float[n_cols]; float *right_mean = new float[n_cols]; + #ifndef _MSC_VER #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d){ left_mean[d] = 0; right_mean[d] = 0; @@ -198,14 +200,18 @@ float TreeNode::splitScoreCosine(const float *obs, const float *grads, const flo sample_idx = sample_indices[n]; grad_row = sample_idx*n_cols; if (obs[sample_idx*n_features + split_candidate.feature_idx] > split_candidate.feature_value){ - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d) right_mean[d] += grads[grad_row + d]; right_norms += grads_norm[sample_idx]; right_indices[right_count] = sample_idx; ++right_count; } else { - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d) left_mean[d] += grads[grad_row + d]; left_norms += grads_norm[sample_idx]; @@ -226,7 +232,9 @@ float TreeNode::splitScoreCosine(const float *obs, const float *grads, const flo float left_count_f = static_cast(left_count), right_count_f = static_cast(right_count); float left_count_recip = (left_count > 0 ) ? 1.0f / left_count_f : 0.0f; float right_count_recip = (right_count > 0 ) ? 1.0f / right_count_f : 0.0f; + #ifndef _MSC_VER #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d){ left_mean[d] *= left_count_recip; right_mean[d] *= right_count_recip; @@ -252,7 +260,9 @@ float TreeNode::splitScoreCosineCategorical(const char *obs, const float *grads, const int *sample_indices = this->sample_indices; float *left_mean = new float[n_cols]; float *right_mean = new float[n_cols]; + #ifndef _MSC_VER #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d){ left_mean[d] = 0; right_mean[d] = 0; @@ -265,14 +275,18 @@ float TreeNode::splitScoreCosineCategorical(const char *obs, const float *grads, sample_idx = sample_indices[n]; grad_row = sample_idx*n_cols; if (strcmp(&obs[(sample_idx*n_features + split_candidate.feature_idx) * MAX_CHAR_SIZE], split_candidate.categorical_value) == 0){ - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d) right_mean[d] += grads[grad_row + d]; right_norms += grads_norm[sample_idx]; right_indices[right_count] = sample_idx; ++right_count; } else { - #pragma omp simd + #ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d) left_mean[d] += grads[grad_row + d]; left_norms += grads_norm[sample_idx]; @@ -293,7 +307,9 @@ float TreeNode::splitScoreCosineCategorical(const char *obs, const float *grads, float left_count_f = static_cast(left_count), right_count_f = static_cast(right_count); float left_count_recip = (left_count > 0 ) ? 1.0f / left_count_f : 0.0f; float right_count_recip = (right_count > 0 ) ? 1.0f / right_count_f : 0.0f; + #ifndef _MSC_VER #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d){ left_mean[d] *= left_count_recip; right_mean[d] *= right_count_recip; @@ -318,7 +334,9 @@ float TreeNode::splitScoreL2(const float *obs, const float *grads, const splitCa float *left_mean = new float[n_cols]; float *right_mean = new float[n_cols]; + #ifndef _MSC_VER #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d){ left_mean[d] = 0; right_mean[d] = 0; @@ -329,12 +347,16 @@ float TreeNode::splitScoreL2(const float *obs, const float *grads, const splitCa sample_idx = sample_indices[n]; grad_row = sample_idx*n_cols; if (obs[sample_idx*n_features + split_candidate.feature_idx] > split_candidate.feature_value){ - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d) right_mean[d] += grads[grad_row + d]; ++right_count; } else { - #pragma omp simd + #ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d) left_mean[d] += grads[grad_row + d]; ++left_count; @@ -350,7 +372,9 @@ float TreeNode::splitScoreL2(const float *obs, const float *grads, const splitCa float left_count_f = static_cast(left_count), right_count_f = static_cast(right_count); float left_count_recip = (left_count > 0 ) ? 1.0f / left_count : 0.0f; float right_count_recip = (right_count > 0) ? 1.0f / right_count_f : 0.0f; + #ifndef _MSC_VER #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d){ left_mean[d] *= left_count_recip; right_mean[d] *= right_count_recip; @@ -369,7 +393,9 @@ float TreeNode::splitScoreL2Categorical(const char *obs, const float *grads, con float *left_mean = new float[n_cols]; float *right_mean = new float[n_cols]; + #ifndef _MSC_VER #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d){ left_mean[d] = 0; right_mean[d] = 0; @@ -380,12 +406,16 @@ float TreeNode::splitScoreL2Categorical(const char *obs, const float *grads, con sample_idx = sample_indices[n]; grad_row = sample_idx*n_cols; if (strcmp(&obs[(sample_idx*n_features + split_candidate.feature_idx) * MAX_CHAR_SIZE], split_candidate.categorical_value) == 0){ - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d) right_mean[d] += grads[grad_row + d]; ++right_count; } else { - #pragma omp simd + #ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d) left_mean[d] += grads[grad_row + d]; ++left_count; @@ -402,7 +432,9 @@ float TreeNode::splitScoreL2Categorical(const char *obs, const float *grads, con float left_count_f = static_cast(left_count), right_count_f = static_cast(right_count); float left_count_recip = (left_count > 0 ) ? 1.0f / left_count : 0.0f; float right_count_recip = (right_count > 0 ) ? 1.0f / right_count_f : 0.0f; + #ifndef _MSC_VER #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d){ left_mean[d] *= left_count_recip; right_mean[d] *= right_count_recip; diff --git a/gbrl/src/cpp/optimizer.cpp b/gbrl/src/cpp/optimizer.cpp index 85ea47c..8c6af96 100755 --- a/gbrl/src/cpp/optimizer.cpp +++ b/gbrl/src/cpp/optimizer.cpp @@ -105,7 +105,9 @@ SGDOptimizer::SGDOptimizer(schedulerFunc schedule_func, float init_lr, float sto void SGDOptimizer::step(float *theta, const float *raw_grad_theta, int t, int sample_idx){ int start_idx = this->start_idx, end_idx = this->end_idx; float lr = this->scheduler->get_lr(t); +#ifndef _MSC_VER #pragma omp simd +#endif for (int i = start_idx; i < end_idx; i++){ theta[sample_idx + i] -= lr * raw_grad_theta[i]; } @@ -259,7 +261,9 @@ void AdamOptimizer::step(float *theta, const float *raw_grad_theta, int t, int s float *raw_m = this->m, *raw_v = this->v; float alpha = lr*sqrt(1 - pow(this->beta_2, t_float)) / (1 - pow(this->beta_1, t_float)); +#ifndef _MSC_VER #pragma omp simd +#endif for (int i = start_idx; i < end_idx; ++i){ int index = sample_idx + i; raw_m[index] *= this->beta_1; diff --git a/gbrl/src/cpp/predictor.cpp b/gbrl/src/cpp/predictor.cpp index d8b8c76..cb3e63d 100755 --- a/gbrl/src/cpp/predictor.cpp +++ b/gbrl/src/cpp/predictor.cpp @@ -45,7 +45,9 @@ void Predictor::momentum_over_leaves(const float *obs, const char *categorical_o break; } if (passed){ - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < output_dim; ++d){ momentum[sample_idx + d] *= cv_beta; momentum[sample_idx + d] += cv_1_m_beta * values[leaf_idx * output_dim + d]; @@ -88,7 +90,9 @@ void Predictor::momentum_over_trees(const float *obs, const char *categorical_ob } int value_idx = (initial_leaf_idx + leaf_idx)*metadata->output_dim; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < metadata->output_dim; ++d){ momentum[sample_idx + d] *= cv_beta; momentum[sample_idx + d] += cv_1_m_beta * values[value_idx + d]; diff --git a/gbrl/src/cpp/split_candidate_generator.cpp b/gbrl/src/cpp/split_candidate_generator.cpp index 63acdcd..a4b070e 100755 --- a/gbrl/src/cpp/split_candidate_generator.cpp +++ b/gbrl/src/cpp/split_candidate_generator.cpp @@ -259,7 +259,9 @@ float scoreCosine(const int *indices, const int n_samples, const float *grads, c float *mean = new float[n_cols]; float n_samples_f = static_cast(n_samples); float n_samples_recip = 1.0f / n_samples_f; +#ifndef _MSC_VER #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d){ mean[d] = 0; } @@ -267,14 +269,18 @@ float scoreCosine(const int *indices, const int n_samples, const float *grads, c for (int i = 0; i < n_samples; ++i){ int idx = indices[i]; int row = idx * n_cols; - #pragma omp simd +#ifndef _MSC_VER + #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d){ mean[d] += grads[row + d]; } squared_norms += grads_norm_raw[idx]; } +#ifndef _MSC_VER #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d){ mean[d] *= n_samples_recip; } @@ -290,19 +296,24 @@ float scoreL2(const int *indices, const int n_samples, const float *grads, const float n_samples_f = static_cast(n_samples); float n_samples_recip = 1.0f / n_samples_f; +#ifndef _MSC_VER #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d){ mean[d] = 0.0f; } +#ifndef _MSC_VER #pragma omp simd +#endif for (int i = 0; i < n_samples * n_cols; ++i){ int row = i / n_cols; int col = i % n_cols; mean[col] += grads[indices[row]*n_cols + col]; } - // printf("l2 mean: ["); +#ifndef _MSC_VER #pragma omp simd +#endif for (int d = 0; d < n_cols; ++d){ mean[d] *= n_samples_recip; }