@@ -263,7 +263,7 @@ static inline __m512 set_fp4_lut() {
263263// DATA_TYPE: 1 = FP4, 2 = NF4
264264template <typename T, int DATA_TYPE >
265265void dequantizeBlockwise4bitCpu (
266- unsigned char * __restrict A, const float * __restrict absmax, T* __restrict out, long long blocksize, long long m, long long n
266+ unsigned char * A, const float * absmax, T* out, long long blocksize, long long m, long long n
267267) {
268268 static_assert (DATA_TYPE == 1 || DATA_TYPE == 2 , " dequantizeBlockwise4bitCpu called with non 4-bit DATA_TYPE" );
269269 if (blocksize <= 0 || m < 0 || n <= 0 )
@@ -408,7 +408,7 @@ void dequantizeBlockwise4bitCpu(
408408
409409template <typename T>
410410void dequantizeBlockwise8bitCpu (
411- float * __restrict code, unsigned char * __restrict A, const float * __restrict absmax, T* __restrict out, long long blocksize, long long n
411+ float * code, unsigned char * A, const float * absmax, T* out, long long blocksize, long long n
412412) {
413413 if (blocksize <= 0 || n <= 0 )
414414 return ;
@@ -418,6 +418,9 @@ void dequantizeBlockwise8bitCpu(
418418 long long valid_items = (n - block_idx >= blocksize ? blocksize : n - block_idx);
419419 long long block_end = block_idx + valid_items;
420420 float scale = absmax[block_idx / blocksize];
421+ #ifdef _MSC_VER
422+ #pragma loop(ivdep)
423+ #endif
421424 for (long long i = block_idx; i < block_end; ++i) {
422425 float v = code[A[i]] * scale;
423426 if constexpr (std::is_same<T, bf16_t >::value) {
@@ -518,7 +521,7 @@ static inline uint16_t norm_to_lut_index(float val) {
518521}
519522
520523template <typename T>
521- void quantize_cpu_impl (float * __restrict code, const T* __restrict A, float * __restrict absmax, unsigned char * __restrict out, long long blocksize, long long n) {
524+ void quantize_cpu_impl (float * code, const T* A, float * absmax, unsigned char * out, long long blocksize, long long n) {
522525 if (blocksize <= 0 || n <= 0 )
523526 return ;
524527
0 commit comments