From d3d6e689001cf43931110aed86498de6c0ef14c6 Mon Sep 17 00:00:00 2001
From: benjamin <benjafuhrer@gmail.com>
Date: Wed, 5 Jun 2024 20:01:21 +0300
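Subject: [PATCH] Disable OpenMP SIMD pragmas when building with MSVC

Wrap the `#pragma omp simd` directives in `#ifndef _MSC_VER` so they are
skipped under MSVC, and stop requesting the experimental MSVC OpenMP
runtime in CMake. On Windows, link the CUDA runtime through the
CUDA::cudart target instead of a hard-coded cudart.lib path.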

---
 CMakeLists.txt                             |   8 +-
 gbrl/src/cpp/CMakeLists.txt                |   3 +-
 gbrl/src/cpp/fitter.cpp                    |   8 +-
 gbrl/src/cpp/loss.cpp                      |   8 +-
 gbrl/src/cpp/math_ops.cpp                  | 192 ++++++++++++++++-----
 gbrl/src/cpp/math_ops.h                    |   6 +
 gbrl/src/cpp/node.cpp                      |  48 +++++-
 gbrl/src/cpp/optimizer.cpp                 |   4 +
 gbrl/src/cpp/predictor.cpp                 |   8 +-
 gbrl/src/cpp/split_candidate_generator.cpp |  15 +-
 10 files changed, 233 insertions(+), 67 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ae6c4a9..8d72b4f 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -125,11 +125,6 @@ if(APPLE)
       find_package(OpenMP REQUIRED)  
     endif()
 else()
-    # Set OpenMP runtime for MSVC
-    if (MSVC)
-        set(OpenMP_RUNTIME_MSVC experimental)
-        message(STATUS "OpenMP_RUNTIME_MSVC set to ${OpenMP_RUNTIME_MSVC}")
-    endif()
     find_package(OpenMP REQUIRED)
     set(OpenMP_CXX_FLAGS ${OpenMP_CXX_FLAGS})
     set(OpenMP_CXX_LIB_NAMES ${OpenMP_CXX_LIB_NAMES})
@@ -197,7 +192,8 @@ elseif (WIN32)
     target_link_libraries(gbrl_cpp PRIVATE OpenMP::OpenMP_CXX)
     if (USE_CUDA)
         set(cuda_lib_path "${CUDAToolkit_ROOT_DIR}/lib/x64")
-        target_link_libraries(gbrl_cpp PRIVATE ${cuda_lib_path}/cudart.lib)
+        # Link the CUDA runtime via the CUDA::cudart target, not a hard-coded cudart.lib path.
+        target_link_libraries(gbrl_cpp PRIVATE CUDA::cudart)
     endif()
 endif()
 
diff --git a/gbrl/src/cpp/CMakeLists.txt b/gbrl/src/cpp/CMakeLists.txt
index e869e0f..487e6fb 100755
--- a/gbrl/src/cpp/CMakeLists.txt
+++ b/gbrl/src/cpp/CMakeLists.txt
@@ -33,8 +33,7 @@ if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
 elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -std=c++14 ${OpenMP_C_FLAGS} -Wall -Wpedantic -Wextra")
 elseif (WIN32)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /std:c++14 ${OpenMP_CXX_FLAGS} /O2 /W3 ")
-    message(STATUS ${OpenMP_CXX_FLAGS} )
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /std:c++14 ${OpenMP_CXX_FLAGS} /W3")
 endif()
 
 if (CMAKE_BUILD_TYPE STREQUAL "Debug")
diff --git a/gbrl/src/cpp/fitter.cpp b/gbrl/src/cpp/fitter.cpp
index 136c087..6a355ce 100755
--- a/gbrl/src/cpp/fitter.cpp
+++ b/gbrl/src/cpp/fitter.cpp
@@ -177,7 +177,9 @@ float Fitter::fit_cpu(dataSet *dataset, const float* targets, ensembleData *edat
         if (metadata->split_score_func == L2){
             float *mean_grads = calculate_mean(build_grads, batch_n_samples, output_dim, par_th);
             float *std = calculate_var_and_center(build_grads, mean_grads, batch_n_samples, output_dim, par_th);
-            #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int i = 0; i < output_dim; ++i)
                 std[i] = sqrtf(std[i]);
             divide_mat_by_vec_inplace(build_grads, std, batch_dataset.n_samples, metadata->output_dim, metadata->par_th);
@@ -527,7 +529,9 @@ void Fitter::calc_leaf_value(dataSet *dataset, ensembleData *edata, ensembleMeta
         }
         if (passed){
             idx = i*output_dim;
-            #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int d = 0; d < output_dim; ++d)
                 edata->values[leaf_idx*output_dim + d] += grads[idx + d];
             count += 1;
diff --git a/gbrl/src/cpp/loss.cpp b/gbrl/src/cpp/loss.cpp
index 19bbde0..a554353 100755
--- a/gbrl/src/cpp/loss.cpp
+++ b/gbrl/src/cpp/loss.cpp
@@ -25,7 +25,9 @@ float MultiRMSE::get_loss_and_gradients(const float *raw_preds, const float *raw
         int thread_id = omp_get_thread_num();
         int start_idx = thread_id * elements_per_thread;
         int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread;
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int i = start_idx; i < end_idx; ++i){
             row = i / output_dim;
             col = i % output_dim;
@@ -55,7 +57,9 @@ float MultiRMSE::get_loss(const float *raw_preds, const float *raw_targets, cons
             int end_idx = (thread_id == n_threads - 1) ? n_samples : start_idx + samples_per_thread;
             for (int sample_idx = start_idx; sample_idx < end_idx; ++sample_idx){
                 row = sample_idx * output_dim;
-                #pragma omp simd 
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif 
                 for (int d = 0; d < output_dim; ++d){
                     grad_value = raw_preds[row + d] - raw_targets[row + d];
                     losses[thread_id] += (grad_value * grad_value);
diff --git a/gbrl/src/cpp/math_ops.cpp b/gbrl/src/cpp/math_ops.cpp
index 7962e6b..550ea13 100755
--- a/gbrl/src/cpp/math_ops.cpp
+++ b/gbrl/src/cpp/math_ops.cpp
@@ -25,14 +25,18 @@ void add_vec_to_mat(float *mat, const float *vec, const int n_samples, const int
             int thread_id = omp_get_thread_num();
             int start_idx = thread_id * elements_per_thread;
             int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread;
-            #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int i = start_idx; i < end_idx; ++i) {
                 int col = i % n_cols;
                 mat[i] += vec[col]; 
             }
         }
     } else {
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int i = 0; i < n_elements; ++i) {
             int col = i % n_cols;
             mat[i] += vec[col]; 
@@ -51,14 +55,18 @@ void multiply_mat_by_vec_subtract_result(float *result, const float *mat, const
             int thread_id = omp_get_thread_num();
             int start_idx = thread_id * elements_per_thread;
             int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread;
-            #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int i = start_idx; i < end_idx; ++i) {
                 int col = i % n_cols;
                 result[i] -= (mat[i]*vec[col]); 
             }
         }
     } else {
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int i = 0; i < n_elements; ++i) {
             int col = i % n_cols;
             result[i] -= (mat[i]*vec[col]); 
@@ -78,14 +86,18 @@ void divide_mat_by_vec_inplace(float *mat, const float *vec, const int n_samples
             int thread_id = omp_get_thread_num();
             int start_idx = thread_id * elements_per_thread;
             int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread;
-            #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int i = start_idx; i < end_idx; ++i) {
                 int col = i % n_cols;
                 mat[i] /= (vec[col] + 1e-8f); 
             }
         }
     } else {
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int i = 0; i < n_elements; ++i) {
             int col = i % n_cols;
             mat[i] /= (vec[col] + 1e-8f);    
@@ -104,14 +116,18 @@ void subtract_vec_from_mat(float *mat, float *vec, const int n_samples, const in
             int thread_id = omp_get_thread_num();
             int start_idx = thread_id * elements_per_thread;
             int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread;
-            #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int i = start_idx; i < end_idx; ++i) {
                 int col = i % n_cols;
                 mat[i] -= vec[col]; 
             }
         }
     } else {
-        #pragma omp simd
+        #ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int i = 0; i < n_elements; ++i) {
             int col = i % n_cols;
             mat[i] -= vec[col];   
@@ -131,13 +147,17 @@ void multiply_mat_by_scalar(float *mat, float scalar, const int n_samples, const
             int thread_id = omp_get_thread_num();
             int start_idx = thread_id * elements_per_thread;
             int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread;
-            #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int i = start_idx; i < end_idx; ++i) {
                 mat[i] *= scalar; 
             }
         }
     } else {
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int i = 0; i < n_elements; ++i) {
             mat[i] *= scalar;  
         }
@@ -148,7 +168,9 @@ float* calculate_mean(const float *mat, const int n_samples, const int n_cols, c
     int n_elements = n_samples * n_cols;
     float *mean = new float[n_cols];
     float n_samples_recip = 1.0f / static_cast<float>(n_samples);
+#ifndef _MSC_VER
     #pragma omp simd
+#endif
     for (int d = 0; d < n_cols; ++d) 
         mean[d] = 0.0f;
     int n_threads = calculate_num_threads(n_elements, par_th);
@@ -156,7 +178,9 @@ float* calculate_mean(const float *mat, const int n_samples, const int n_cols, c
         omp_set_num_threads(n_threads);
         int elements_per_thread = (n_elements) / n_threads;
         float *thread_mean = new float[n_threads*n_cols];
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int d = 0; d < n_threads*n_cols; ++d) 
             thread_mean[d] = 0.0f;
         #pragma omp parallel
@@ -164,27 +188,35 @@ float* calculate_mean(const float *mat, const int n_samples, const int n_cols, c
             int thread_id = omp_get_thread_num();
             int start_idx = thread_id * elements_per_thread;
             int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread;
-            #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int i = start_idx; i < end_idx; ++i) {
                 int col = i % n_cols;
                 thread_mean[thread_id * n_cols + col] += mat[i];
             }
         }
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int d = 0; d < n_threads*n_cols; ++d) {
             int col = d % n_cols;
             mean[col] += thread_mean[d];
         }
         delete[] thread_mean;
     } else {
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int i = 0; i < n_elements; ++i) {
             int col = i % n_cols;
             mean[col] += mat[i]; 
         }
     }
 
+#ifndef _MSC_VER
     #pragma omp simd
+#endif
     for (int d = 0; d < n_cols; ++d) 
         mean[d] *= n_samples_recip;
 
@@ -197,7 +229,9 @@ float* calculate_var(const float *mat, const float *mean, const int n_samples, c
     float value;
 
     float *var = new float[n_cols];
+#ifndef _MSC_VER
     #pragma omp simd
+#endif
     for (int d = 0; d < n_cols; ++d) {
         var[d] = 0;
     }
@@ -207,7 +241,9 @@ float* calculate_var(const float *mat, const float *mean, const int n_samples, c
         int elements_per_thread = n_elements / n_threads;
         omp_set_num_threads(n_threads);
         float *thread_var = new float[n_threads*n_cols];
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int d = 0; d < n_threads*n_cols; ++d){
             thread_var[d] = 0;
         }
@@ -216,21 +252,27 @@ float* calculate_var(const float *mat, const float *mean, const int n_samples, c
             int thread_id = omp_get_thread_num();
             int start_idx = thread_id * elements_per_thread;
             int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread;
-            #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int i = start_idx; i < end_idx; ++i) {
                 int col = i % n_cols;
                 value = mat[i] - mean[col];
                 thread_var[thread_id * n_cols + col] += (value * value);
             }
         }
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int d = 0; d < n_threads*n_cols; ++d) {
             int col = d % n_cols;
             var[col] += thread_var[d];
         }
         delete[] thread_var;
     } else {
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int i = 0; i < n_elements; ++i) {
             int col = i % n_cols;
             value = mat[i] - mean[col];
@@ -238,7 +280,9 @@ float* calculate_var(const float *mat, const float *mean, const int n_samples, c
         }
     }
 
+#ifndef _MSC_VER
     #pragma omp simd
+#endif
     for (int d = 0; d < n_cols; ++d) {
         var[d] *= n_samples_recip;
     }
@@ -250,7 +294,9 @@ float* calculate_row_covariance(const float *mat_l, const float *mat_r, const in
     int n_elements = n_samples * n_cols;
     // assumes both matrices are centered
     float *cov = new float[n_cols];
+#ifndef _MSC_VER
     #pragma omp simd
+#endif
     for (int d = 0; d < n_cols; ++d) 
         cov[d] = 0.0f;
 
@@ -261,7 +307,9 @@ float* calculate_row_covariance(const float *mat_l, const float *mat_r, const in
         int elements_per_thread = n_elements / n_threads;
         omp_set_num_threads(n_threads);
         float *thread_cov = new float[n_threads*n_cols];
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int d = 0; d < n_threads*n_cols; ++d) 
             thread_cov[d] = 0.0f;
         #pragma omp parallel
@@ -269,27 +317,35 @@ float* calculate_row_covariance(const float *mat_l, const float *mat_r, const in
             int thread_id = omp_get_thread_num();
             int start_idx = thread_id * elements_per_thread;
             int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread;
-            #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int i = start_idx; i < end_idx; ++i) {
                 int col = i % n_cols;
                 thread_cov[thread_id * n_cols + col] += (mat_l[i] * mat_r[i]);
             }
         }
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int d = 0; d < n_threads*n_cols; ++d) {
             int col = d % n_cols;
             cov[col] += thread_cov[d];
         }
         delete[] thread_cov;
     } else {
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int i = 0; i < n_elements; ++i) {
             int col = i % n_cols;
             cov[col] += ((mat_l[i] * mat_r[i]));  
         }
     }
 
+#ifndef _MSC_VER
     #pragma omp simd
+#endif
     for (int d = 0; d < n_cols; ++d) 
         cov[d] *= n_samples_recip;
 
@@ -302,7 +358,9 @@ float* calculate_var_and_center(float *mat, const float *mean, const int n_sampl
     float n_samples_recip = 1.0f / (static_cast<float>(n_samples) - 1.0f);
     float value;
 
+#ifndef _MSC_VER
     #pragma omp simd
+#endif
     for (int d = 0; d < n_cols; ++d) 
         var[d] = 0;
 
@@ -311,7 +369,9 @@ float* calculate_var_and_center(float *mat, const float *mean, const int n_sampl
         int elements_per_thread = n_elements / n_threads;
         omp_set_num_threads(n_threads);
         float *thread_var = new float[n_threads*n_cols];
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int d = 0; d < n_threads*n_cols; ++d) 
             thread_var[d] = 0.0f;
         #pragma omp parallel
@@ -319,7 +379,9 @@ float* calculate_var_and_center(float *mat, const float *mean, const int n_sampl
             int thread_id = omp_get_thread_num();
             int start_idx = thread_id * elements_per_thread;
             int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread;
-            #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int i = start_idx; i < end_idx; ++i) {
                 int col = i % n_cols;
                 value = mat[i] - mean[col];
@@ -327,14 +389,18 @@ float* calculate_var_and_center(float *mat, const float *mean, const int n_sampl
                 mat[i] -= mean[col]; 
             }
         }
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int d = 0; d < n_threads*n_cols; ++d) {
             int col = d % n_cols;
             var[col] += thread_var[d];
         }
         delete[] thread_var;
     } else {
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int i = 0; i < n_elements; ++i) {
             int col = i % n_cols;
             value = mat[i] - mean[col];
@@ -343,7 +409,9 @@ float* calculate_var_and_center(float *mat, const float *mean, const int n_sampl
         }
     }
 
+#ifndef _MSC_VER
     #pragma omp simd
+#endif
     for (int d = 0; d < n_cols; ++d) 
         var[d] *= n_samples_recip;
 
@@ -356,7 +424,9 @@ float* calculate_std_and_center(float *mat, const float *mean, const int n_sampl
     float n_samples_recip = 1.0f / (static_cast<float>(n_samples) - 1.0f);
     float value;
 
+#ifndef _MSC_VER
     #pragma omp simd
+#endif
     for (int d = 0; d < n_cols; ++d) 
         var[d] = 0;
 
@@ -365,7 +435,9 @@ float* calculate_std_and_center(float *mat, const float *mean, const int n_sampl
         int elements_per_thread = n_elements / n_threads;
         omp_set_num_threads(n_threads);
         float *thread_var = new float[n_threads*n_cols];
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int d = 0; d < n_threads*n_cols; ++d) 
             thread_var[d] = 0;
         #pragma omp parallel
@@ -373,7 +445,9 @@ float* calculate_std_and_center(float *mat, const float *mean, const int n_sampl
             int thread_id = omp_get_thread_num();
             int start_idx = thread_id * elements_per_thread;
             int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread;
-            #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int i = start_idx; i < end_idx; ++i) {
                 int col = i % n_cols;
                 value = mat[i] - mean[col];
@@ -381,14 +455,18 @@ float* calculate_std_and_center(float *mat, const float *mean, const int n_sampl
                 mat[i] -= mean[col]; 
             }
         }
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int d = 0; d < n_threads*n_cols; ++d) {
             int col = d % n_cols;
             var[col] += thread_var[d];
         }
         delete[] thread_var;
     } else {
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int i = 0; i < n_elements; ++i) {
             int col = i % n_cols;
             value = mat[i] - mean[col];
@@ -397,7 +475,9 @@ float* calculate_std_and_center(float *mat, const float *mean, const int n_sampl
         }
     }
 
+#ifndef _MSC_VER
     #pragma omp simd
+#endif
     for (int d = 0; d < n_cols; ++d)
         var[d] = sqrtf(var[d] * n_samples_recip);
 
@@ -415,7 +495,9 @@ float* copy_mat(const float *mat, const int size, const int par_th){
             int thread_id = omp_get_thread_num();
             int start_idx = thread_id * elements_per_thread;
             int end_idx = (thread_id == n_threads - 1) ? size : start_idx + elements_per_thread;
-            #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int i = start_idx; i < end_idx; ++i)
                 copied_mat[i] = mat[i];
         }
@@ -436,7 +518,9 @@ void _element_wise_addition(float *mat_l, const float *mat_r, const int size, co
             int thread_id = omp_get_thread_num();
             int start_idx = thread_id * elements_per_thread;
             int end_idx = (thread_id == n_threads - 1) ? size : start_idx + elements_per_thread;
-            #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int i = start_idx; i < end_idx; ++i)
                 mat_l[i] += mat_r[i];
         }
@@ -457,7 +541,9 @@ float* element_wise_division(const float *mat_l, const float *mat_r, const int s
             int thread_id = omp_get_thread_num();
             int start_idx = thread_id * elements_per_thread;
             int end_idx = (thread_id == n_threads - 1) ? size : start_idx + elements_per_thread;
-            #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int i = start_idx; i < end_idx; ++i)
                 result[i] = mat_l[i] / (mat_r[i] + + 1e-8f);
         }
@@ -478,12 +564,16 @@ void set_zero_mat(float *mat, const int size, const int par_th){
             int thread_id = omp_get_thread_num();
             int start_idx = thread_id * elements_per_thread;
             int end_idx = (thread_id == n_threads - 1) ? size : start_idx + elements_per_thread;
-            #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int i = start_idx; i < end_idx; ++i)
                 mat[i] = 0.0f;
         }
      } else {
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int i = 0; i < size; ++i)
             mat[i] = 0.0f;
     }
@@ -516,21 +606,27 @@ float* calculate_max(const float *mat, const int n_samples, const int n_cols, co
             int thread_id = omp_get_thread_num();
             int start_idx = thread_id * elements_per_thread;
             int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread;
-            #pragma omp simd
+ #ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int i = start_idx; i < end_idx; ++i) {
                 int col = i % n_cols;
                 thread_max[thread_id * n_cols + col] = mat[i] > thread_max[thread_id * n_cols + col] ? mat[i] : thread_max[thread_id * n_cols + col] ;
             }
         }
 
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int d = 0; d < n_threads * n_cols; ++d) {
             int col = d % n_cols;
             max[col] = max[col] > thread_max[d] ? max[col] : thread_max[d];
         }
         delete[] thread_max;
     } else {
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int i = 0; i < n_elements; ++i) {
             int col = i % n_cols;
             max[col] = max[col] > mat[i] ? max[col] : mat[i];
@@ -559,21 +655,27 @@ float* calculate_min(const float *mat, const int n_samples, const int n_cols, co
             int thread_id = omp_get_thread_num();
             int start_idx = thread_id * elements_per_thread;
             int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread;
-            #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int i = start_idx; i < end_idx; ++i) {
                 int col = i % n_cols;
                 thread_min[thread_id * n_cols + col] = mat[i] < thread_min[thread_id * n_cols + col] ? mat[i] : thread_min[thread_id * n_cols + col] ;
             }
         }
 
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int d = 0; d < n_threads * n_cols; ++d) {
             int col = d % n_cols;
             min[col] = min[col] < thread_min[d] ? min[col] : thread_min[d];
         }
         delete[] thread_min;
     } else {
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int i = 0; i < n_elements; ++i) {
             int col = i % n_cols;
             min[col] = min[col] < mat[i] ? min[col] : mat[i];
@@ -594,14 +696,18 @@ void calculate_squared_norm(float *norm, const float *mat, const int n_samples,
             int thread_id = omp_get_thread_num();
             int start_idx = thread_id * elements_per_thread;
             int end_idx = (thread_id == n_threads - 1) ? n_elements : start_idx + elements_per_thread;
-            #pragma omp simd
+ #ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int i = start_idx; i < end_idx; ++i) {
                 int row = i / n_cols;
                 norm[row] += mat[i]*mat[i];
             }
         }
     } else {
-        #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
         for (int i = 0; i < n_elements; ++i) {
             int row = i / n_cols;
             norm[row] += mat[i]*mat[i];
diff --git a/gbrl/src/cpp/math_ops.h b/gbrl/src/cpp/math_ops.h
index ff86d91..c6ccae4 100755
--- a/gbrl/src/cpp/math_ops.h
+++ b/gbrl/src/cpp/math_ops.h
@@ -32,7 +32,9 @@ void calculate_squared_norm(float *norm, const float *mat, const int n_samples,
 
 inline float mat_vec_dot_sum(const int *indices, const float *grads, const float *vec, const int n_samples, const int n_cols){
     float sum = 0.0f;
+#ifndef _MSC_VER
     #pragma omp simd
+#endif
     for (int i = 0; i < n_samples*n_cols; ++i){
         int row = i / n_cols;
         int col = i % n_cols;
@@ -43,7 +45,9 @@ inline float mat_vec_dot_sum(const int *indices, const float *grads, const float
 
 inline float norm(const float *vec, const int n_samples){
     float sum = 0.0f;
+#ifndef _MSC_VER
     #pragma omp simd
+#endif
     for (int n = 0; n < n_samples; ++n){
         sum += (vec[n]*vec[n]);
     }
@@ -52,7 +56,9 @@ inline float norm(const float *vec, const int n_samples){
 
 inline float squared_norm(const float *vec, const int n_samples){
     float sum = 0.0f;
+#ifndef _MSC_VER
     #pragma omp simd
+#endif
     for (int n = 0; n < n_samples; ++n){
         sum += (vec[n]*vec[n]);
     }
diff --git a/gbrl/src/cpp/node.cpp b/gbrl/src/cpp/node.cpp
index 93789aa..ea76bc1 100755
--- a/gbrl/src/cpp/node.cpp
+++ b/gbrl/src/cpp/node.cpp
@@ -185,7 +185,9 @@ float TreeNode::splitScoreCosine(const float *obs, const float *grads, const flo
     const int *sample_indices = this->sample_indices;
     float *left_mean = new float[n_cols]; 
     float *right_mean = new float[n_cols]; 
+ #ifndef _MSC_VER
     #pragma omp simd
+#endif
     for (int d = 0; d < n_cols; ++d){
         left_mean[d] = 0;
         right_mean[d] = 0;
@@ -198,14 +200,18 @@ float TreeNode::splitScoreCosine(const float *obs, const float *grads, const flo
         sample_idx = sample_indices[n];
         grad_row = sample_idx*n_cols;
         if (obs[sample_idx*n_features + split_candidate.feature_idx] > split_candidate.feature_value){
-            #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int d = 0; d < n_cols; ++d)
                 right_mean[d] += grads[grad_row + d];
             right_norms += grads_norm[sample_idx];
             right_indices[right_count] = sample_idx;
             ++right_count;
         } else {
-            #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int d = 0; d < n_cols; ++d)
                 left_mean[d] += grads[grad_row + d];
             left_norms += grads_norm[sample_idx];
@@ -226,7 +232,9 @@ float TreeNode::splitScoreCosine(const float *obs, const float *grads, const flo
     float left_count_f = static_cast<float>(left_count), right_count_f = static_cast<float>(right_count);
     float left_count_recip = (left_count > 0 ) ? 1.0f / left_count_f : 0.0f;
     float right_count_recip = (right_count > 0 ) ? 1.0f / right_count_f : 0.0f;
+ #ifndef _MSC_VER
     #pragma omp simd
+#endif
     for (int d = 0; d < n_cols; ++d){
         left_mean[d] *= left_count_recip;
         right_mean[d] *= right_count_recip;
@@ -252,7 +260,9 @@ float TreeNode::splitScoreCosineCategorical(const char *obs, const float *grads,
     const int *sample_indices = this->sample_indices;
     float *left_mean = new float[n_cols]; 
     float *right_mean = new float[n_cols]; 
+ #ifndef _MSC_VER
     #pragma omp simd
+#endif
     for (int d = 0; d < n_cols; ++d){
         left_mean[d] = 0;
         right_mean[d] = 0;
@@ -265,14 +275,18 @@ float TreeNode::splitScoreCosineCategorical(const char *obs, const float *grads,
         sample_idx = sample_indices[n];
         grad_row = sample_idx*n_cols;
         if (strcmp(&obs[(sample_idx*n_features + split_candidate.feature_idx) * MAX_CHAR_SIZE],  split_candidate.categorical_value) == 0){
-            #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int d = 0; d < n_cols; ++d)
                 right_mean[d] += grads[grad_row + d];
             right_norms += grads_norm[sample_idx];
             right_indices[right_count] = sample_idx;
             ++right_count;
         } else {
-            #pragma omp simd
+         #ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int d = 0; d < n_cols; ++d)
                 left_mean[d] += grads[grad_row + d];
             left_norms += grads_norm[sample_idx];
@@ -293,7 +307,9 @@ float TreeNode::splitScoreCosineCategorical(const char *obs, const float *grads,
     float left_count_f = static_cast<float>(left_count), right_count_f = static_cast<float>(right_count);
     float left_count_recip = (left_count > 0 ) ? 1.0f / left_count_f : 0.0f;
     float right_count_recip = (right_count > 0 ) ? 1.0f / right_count_f : 0.0f;
+ #ifndef _MSC_VER
     #pragma omp simd
+#endif
     for (int d = 0; d < n_cols; ++d){
         left_mean[d] *= left_count_recip;
         right_mean[d] *= right_count_recip;
@@ -318,7 +334,9 @@ float TreeNode::splitScoreL2(const float *obs, const float *grads, const splitCa
 
     float *left_mean = new float[n_cols]; 
     float *right_mean = new float[n_cols]; 
+ #ifndef _MSC_VER
     #pragma omp simd
+#endif
     for (int d = 0; d < n_cols; ++d){
         left_mean[d] = 0;
         right_mean[d] = 0;
@@ -329,12 +347,16 @@ float TreeNode::splitScoreL2(const float *obs, const float *grads, const splitCa
         sample_idx = sample_indices[n];
         grad_row = sample_idx*n_cols;
         if (obs[sample_idx*n_features + split_candidate.feature_idx] > split_candidate.feature_value){
-            #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int d = 0; d < n_cols; ++d)
                     right_mean[d] += grads[grad_row + d];
             ++right_count;
         } else {
-            #pragma omp simd
+         #ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int d = 0; d < n_cols; ++d)
                 left_mean[d] += grads[grad_row + d];
             ++left_count;
@@ -350,7 +372,9 @@ float TreeNode::splitScoreL2(const float *obs, const float *grads, const splitCa
     float left_count_f = static_cast<float>(left_count), right_count_f = static_cast<float>(right_count);
     float left_count_recip = (left_count > 0 ) ? 1.0f / left_count : 0.0f;
     float right_count_recip = (right_count > 0) ? 1.0f / right_count_f : 0.0f;
+ #ifndef _MSC_VER
     #pragma omp simd
+#endif
     for (int d = 0; d < n_cols; ++d){
         left_mean[d] *= left_count_recip;
         right_mean[d] *= right_count_recip;
@@ -369,7 +393,9 @@ float TreeNode::splitScoreL2Categorical(const char *obs, const float *grads, con
 
     float *left_mean = new float[n_cols]; 
     float *right_mean = new float[n_cols]; 
+ #ifndef _MSC_VER
     #pragma omp simd
+#endif
     for (int d = 0; d < n_cols; ++d){
         left_mean[d] = 0;
         right_mean[d] = 0;
@@ -380,12 +406,16 @@ float TreeNode::splitScoreL2Categorical(const char *obs, const float *grads, con
         sample_idx = sample_indices[n];
         grad_row = sample_idx*n_cols;
         if (strcmp(&obs[(sample_idx*n_features + split_candidate.feature_idx) * MAX_CHAR_SIZE], split_candidate.categorical_value) == 0){
-            #pragma omp simd
+#ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int d = 0; d < n_cols; ++d)
                     right_mean[d] += grads[grad_row + d];
             ++right_count;
         } else {
-            #pragma omp simd
+         #ifndef _MSC_VER
+    #pragma omp simd
+#endif
             for (int d = 0; d < n_cols; ++d)
                 left_mean[d] += grads[grad_row + d];
             ++left_count;
@@ -402,7 +432,9 @@ float TreeNode::splitScoreL2Categorical(const char *obs, const float *grads, con
     float left_count_f = static_cast<float>(left_count), right_count_f = static_cast<float>(right_count);
     float left_count_recip = (left_count > 0 ) ? 1.0f / left_count : 0.0f;
     float right_count_recip = (right_count > 0 ) ? 1.0f / right_count_f : 0.0f;
+#ifndef _MSC_VER // skip omp simd on MSVC
     #pragma omp simd
+#endif // !_MSC_VER
     for (int d = 0; d < n_cols; ++d){
         left_mean[d] *= left_count_recip;
         right_mean[d] *= right_count_recip;
diff --git a/gbrl/src/cpp/optimizer.cpp b/gbrl/src/cpp/optimizer.cpp
index 85ea47c..8c6af96 100755
--- a/gbrl/src/cpp/optimizer.cpp
+++ b/gbrl/src/cpp/optimizer.cpp
@@ -105,7 +105,9 @@ SGDOptimizer::SGDOptimizer(schedulerFunc schedule_func, float init_lr, float sto
 void SGDOptimizer::step(float *theta, const float *raw_grad_theta, int t, int sample_idx){
     int start_idx = this->start_idx, end_idx = this->end_idx;
     float lr = this->scheduler->get_lr(t);
+#ifndef _MSC_VER // skip omp simd on MSVC
     #pragma omp simd
+#endif // !_MSC_VER
     for (int i = start_idx; i < end_idx; i++){
         theta[sample_idx + i] -= lr * raw_grad_theta[i];
     }
@@ -259,7 +261,9 @@ void AdamOptimizer::step(float *theta, const float *raw_grad_theta, int t, int s
     float *raw_m = this->m, *raw_v = this->v;
     float alpha = lr*sqrt(1 - pow(this->beta_2, t_float)) / (1 - pow(this->beta_1, t_float));
 
+#ifndef _MSC_VER // skip omp simd on MSVC
     #pragma omp simd
+#endif // !_MSC_VER
     for (int i = start_idx; i < end_idx; ++i){
         int index = sample_idx + i;
         raw_m[index] *= this->beta_1; 
diff --git a/gbrl/src/cpp/predictor.cpp b/gbrl/src/cpp/predictor.cpp
index d8b8c76..cb3e63d 100755
--- a/gbrl/src/cpp/predictor.cpp
+++ b/gbrl/src/cpp/predictor.cpp
@@ -45,7 +45,9 @@ void Predictor::momentum_over_leaves(const float *obs, const char *categorical_o
                 break;
         }
         if (passed){
-            #pragma omp simd
+#ifndef _MSC_VER // skip omp simd on MSVC
+            #pragma omp simd
+#endif // !_MSC_VER
             for (int d = 0; d < output_dim; ++d){
                 momentum[sample_idx + d] *= cv_beta;
                 momentum[sample_idx + d] += cv_1_m_beta * values[leaf_idx * output_dim + d];
@@ -88,7 +90,9 @@ void Predictor::momentum_over_trees(const float *obs, const char *categorical_ob
         }
         
         int value_idx = (initial_leaf_idx + leaf_idx)*metadata->output_dim;
-        #pragma omp simd
+#ifndef _MSC_VER // skip omp simd on MSVC
+        #pragma omp simd
+#endif // !_MSC_VER
         for (int d = 0; d < metadata->output_dim; ++d){
             momentum[sample_idx + d] *= cv_beta;
             momentum[sample_idx + d] += cv_1_m_beta * values[value_idx + d];
diff --git a/gbrl/src/cpp/split_candidate_generator.cpp b/gbrl/src/cpp/split_candidate_generator.cpp
index 63acdcd..a4b070e 100755
--- a/gbrl/src/cpp/split_candidate_generator.cpp
+++ b/gbrl/src/cpp/split_candidate_generator.cpp
@@ -259,7 +259,9 @@ float scoreCosine(const int *indices, const int n_samples, const float *grads, c
     float *mean = new float[n_cols]; 
     float n_samples_f = static_cast<float>(n_samples);
     float n_samples_recip = 1.0f / n_samples_f;
+#ifndef _MSC_VER // skip omp simd on MSVC
     #pragma omp simd
+#endif // !_MSC_VER
     for (int d = 0; d < n_cols; ++d){
         mean[d] = 0;
     }
@@ -267,14 +269,18 @@ float scoreCosine(const int *indices, const int n_samples, const float *grads, c
     for (int i = 0; i < n_samples; ++i){
         int idx = indices[i];
         int row = idx * n_cols;
-        #pragma omp simd
+#ifndef _MSC_VER // skip omp simd on MSVC
+        #pragma omp simd
+#endif // !_MSC_VER
         for (int d = 0; d < n_cols; ++d){
             mean[d] += grads[row + d];
         }
         squared_norms += grads_norm_raw[idx];
     }
 
+#ifndef _MSC_VER // skip omp simd on MSVC
     #pragma omp simd
+#endif // !_MSC_VER
     for (int d = 0; d < n_cols; ++d){
         mean[d] *= n_samples_recip;
     }
@@ -290,19 +296,27 @@ float scoreL2(const int *indices, const int n_samples, const float *grads, const
     float n_samples_f = static_cast<float>(n_samples);
     float n_samples_recip = 1.0f / n_samples_f;
 
+#ifndef _MSC_VER // skip omp simd on MSVC
     #pragma omp simd
+#endif // !_MSC_VER
     for (int d = 0; d < n_cols; ++d){
         mean[d] = 0.0f;
     }
-    #pragma omp simd
-    for (int i = 0; i < n_samples * n_cols; ++i){
-        int row = i / n_cols;
-        int col = i % n_cols;
-        mean[col] += grads[indices[row]*n_cols + col];
-    }
+    // Sum one gradient row at a time. The previous flattened loop revisited
+    // mean[col] every n_cols iterations, a dependence omp simd must not span.
+    for (int i = 0; i < n_samples; ++i){
+        int row = indices[i] * n_cols;
+#ifndef _MSC_VER // skip omp simd on MSVC
+        #pragma omp simd
+#endif // !_MSC_VER
+        for (int d = 0; d < n_cols; ++d){
+            mean[d] += grads[row + d];
+        }
+    }
     
-    // printf("l2 mean: [");
+#ifndef _MSC_VER // skip omp simd on MSVC
     #pragma omp simd
+#endif // !_MSC_VER
     for (int d = 0; d < n_cols; ++d){
         mean[d] *= n_samples_recip;
     }