#546 · Closed · wants to merge 13 commits
auto_round/data_type/int.py (151 additions, 2 deletions)

@@ -15,7 +15,18 @@
import torch
from .utils import round_ste, reshape_pad_tensor_by_group_size, revert_tensor_by_pad
from auto_round.data_type.register import register_dtype

import numpy as np
from concurrent.futures import ProcessPoolExecutor

# logger is used by the CUDA/CPU fallback paths below; assuming the
# project's shared logger in auto_round.utils.
from auto_round.utils import logger
QK_K = 256
K_SCALE_SIZE = 12
GGML_QUANT_SIZES = {
"bf16": (1, 2),
"q4_0": (32, 2 + 16),
"q4_1": (32, 2 + 2 + 16),
"q4_k": (256, 2 + 2 + QK_K//2 + 12),
"q2_k": (256, 2 + 2 + QK_K//16 + QK_K//4),
"q8_0": (32, 2 + 32)
}
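
For orientation: each GGML_QUANT_SIZES entry maps a quant type to (block size in weights, block size in bytes), which pins down the effective bits per weight. A minimal sketch (illustrative only, not part of the diff):

# Illustrative sketch, not part of the diff: bits per weight implied by
# each (block_size, type_size) entry above.
for name, (block_size, type_size) in GGML_QUANT_SIZES.items():
    print(f"{name}: {type_size * 8 / block_size:.2f} bits/weight")
# e.g. q4_k: 144 bytes per 256 weights -> 4.50 bits/weight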

@register_dtype("int_sym")
def quant_tensor_sym(tensor, bits=4, group_size=-1, v=0, min_scale=1.0, max_scale=1.0, scale_dtype=torch.float16,
@@ -72,6 +83,62 @@ def double_quant_tensor(tensor, bits, q_scale_thresh):
qdq_tensor = torch.clamp(round_ste(tensor / scale), max=maxq) * scale
return qdq_tensor, scale

def make_qkx2_quants(data, weight, nmax, group_size, rmin=-1, rdelta=0.1, nstep=20, use_mad=False):
    """Search for the best (scale, min) of one quantization group, following
    llama.cpp's make_qkx2_quants: data ~= scale * L + group_min with levels L in [0, nmax]."""
    group_min = np.min(data)
    group_max = np.max(data)

    sum_w = np.sum(weight)
    sum_x = np.sum(weight * data)

    group_min = min(group_min, 0)
    if group_min == group_max:
        # Constant group: all levels are zero and only the offset matters.
        L = np.zeros(group_size, dtype=np.uint8)
        the_min = -group_min
        return 0.0, L, the_min

    iscale = nmax / (group_max - group_min)
    scale = 1 / iscale

    l_values = np.round(iscale * (data - group_min))
    L = np.clip(l_values, 0, nmax).astype(np.uint8)

    diffs = scale * L + group_min - data
    diffs = np.abs(diffs) if use_mad else diffs**2
    best_mad = np.sum(weight * diffs)

    if nstep < 1:
        the_min = -group_min
        return scale, L, the_min

    for step in range(nstep):
        # Perturb the inverse scale, re-quantize, then solve the 2x2 weighted
        # least-squares system for the (scale, min) that best fits these levels.
        iscale = (rmin + rdelta * step + nmax) / (group_max - group_min)
        l_values = np.round(iscale * (data - group_min))
        Laux = np.clip(l_values, 0, nmax).astype(np.uint8)

        sum_l = np.sum(weight * Laux)
        sum_l2 = np.sum(weight * Laux**2)
        sum_xl = np.sum(weight * Laux * data)

        D = sum_w * sum_l2 - sum_l * sum_l
        if D > 0:
            this_scale = (sum_w * sum_xl - sum_x * sum_l) / D
            this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D
            if this_min > 0:
                # A positive minimum cannot be represented (L >= 0), so pin it
                # to zero and refit the scale alone.
                this_min = 0
                this_scale = sum_xl / sum_l2

            diffs = this_scale * Laux + this_min - data
            diffs = np.abs(diffs) if use_mad else diffs**2
            mad = np.sum(weight * diffs)

            if mad < best_mad:
                L = Laux.copy()
                best_mad = mad
                scale = this_scale
                group_min = this_min

    the_min = -group_min
    return scale, L, the_min
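
A minimal usage sketch for make_qkx2_quants (illustrative values, not part of the diff): quantize one group of 32 values to 4 bits (nmax = 2**4 - 1 = 15), then reconstruct via data ~= scale * L - the_min.

# Illustrative usage, not part of the diff.
rng = np.random.default_rng(0)
data = rng.standard_normal(32).astype(np.float32)
weight = np.abs(data) + np.sqrt(np.mean(data**2))  # importance weights
scale, L, the_min = make_qkx2_quants(data, weight, nmax=15, group_size=32)
recon = scale * L - the_min  # dequantized approximation of data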

@register_dtype("int_asym_dq")
def quant_tensor_asym_dq(tensor, bits=4, group_size=-1, v=0, min_scale=1.0, max_scale=1.0, scale_dtype=torch.float16,
@@ -109,7 +176,9 @@ def quant_tensor_asym_dq(tensor, bits=4, group_size=-1, v=0, min_scale=1.0, max_
else:
wmin = wmin_tmp
wmax = wmax_tmp
-    scale = ((wmax - wmin) / maxq).to(scale_dtype)
+    scale = quant_tensor_k_quant_cuda(tensor)
+    scale = scale.squeeze(-1)
+    scale = torch.from_numpy(scale).to(tensor.dtype).cuda()
scale = torch.clamp(scale, min=q_scale_thresh)
scale = scale.view(-1, super_group_size)
wmin_m = -wmin # pylint: disable=E1130
@@ -130,6 +199,86 @@ def quant_tensor_asym_dq(tensor, bits=4, group_size=-1, v=0, min_scale=1.0, max_
zp = round_ste(wmin_m / scale) # remove this later
return qdq_result, {"scale": scale, "d_scale": d_scale}, {"wmin_m": wmin_m, "d_wmin_m": d_wmin_m}

def quant_tensor_k_quant_cuda(data, num_bits=4, group_size=32):
"""Quantize tensor per group based on k quant.
Ref: https://github.com/ggml-org/llama.cpp/blob/64eda5deb9859e87a020e56bab5d2f9ca956f1de/ggml/src/ggml-quants.c
    [inline review comment, Contributor]: ref to inc pr

    Args:
        data: input weight
        num_bits (int, optional): number of quantization bits. Defaults to 4.
        group_size (int, optional): how many elements share one scale/zp. Defaults to 32.
    Returns:
        scale: per-group scale, shape (num_groups, 1)
    """
try:
import cupy as cp
import torch

if torch.cuda.is_available():
data = cp.asarray(data)
data = data.reshape((-1, group_size)).astype(cp.float32) # nb = data.shape[0], (nb, group_size)
maxq = 2**num_bits - 1
minq = 0
sum_x2 = cp.sum(data**2, axis=1, keepdims=True) # (nb, 1)
av_x = cp.sqrt(sum_x2 / group_size) # (nb, 1)
weights = cp.add(av_x, cp.abs(data)) # (nb, group_size)
rmin = cp.min(data, axis=1, keepdims=True) # (nb, 1)
rmax = cp.max(data, axis=1, keepdims=True) # (nb, 1)
sum_w = cp.sum(weights, axis=1, keepdims=True) # (nb, 1)
            sum_x = cp.sum(weights * data, axis=1, keepdims=True)  # (nb, 1)
iscale = cp.ones(rmax.shape, dtype=data.dtype) # (nb, 1)
mask = rmin != rmax
iscale[mask] = (maxq - minq) / (rmax[mask] - rmin[mask])
scale = 1 / iscale
quant_data = cp.clip(cp.round(iscale * (data - rmin)), minq, maxq) # (nb, group_size)
diff = scale * quant_data + rmin - data # (nb, group_size)
best_mad = cp.sum(weights * diff**2, axis=1, keepdims=True) # (nb, 1)
nstep = 20
rdelta = 0.1
rrmin = -1
for is_ in range(nstep):
iscale_new = cp.ones(rmax.shape, dtype=data.dtype) # (nb, 1)
factor = cp.array([rrmin + rdelta * is_ + maxq - minq]).astype(data.dtype)[0]
mask = rmin != rmax
iscale_new[mask] = factor / (rmax[mask] - rmin[mask])
quant_data_new = cp.clip(cp.round(iscale_new * (data - rmin)), minq, maxq) # (nb, group_size)
mul_weights_quant_data_new = weights * quant_data_new
sum_l = cp.sum(mul_weights_quant_data_new, axis=1, keepdims=True) # (nb, 1)
sum_l2 = cp.sum(mul_weights_quant_data_new * quant_data_new, axis=1, keepdims=True) # (nb, 1)
sum_xl = cp.sum(mul_weights_quant_data_new * data, axis=1, keepdims=True) # (nb, 1)
D = cp.subtract(sum_w * sum_l2, sum_l**2) # (nb, 1)

this_scale = (sum_w * sum_xl - sum_x * sum_l) / D # (nb, 1)
this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D # (nb, 1)

diff = this_scale * quant_data_new + this_min - data # (nb, group_size)
mad = cp.sum(weights * diff**2, axis=1, keepdims=True) # (nb, 1)

mad_1 = cp.array(mad)
best_mad_1 = cp.array(best_mad)
idx_to_replace = cp.where(mad_1 < best_mad_1)[0]
                # [inline review comment, Contributor]: change this line to
                #   idx_to_replace = cp.where((mad_1 < best_mad_1) & (D > 0))[0]

quant_data[idx_to_replace, :] = quant_data_new[idx_to_replace, :]
best_mad[idx_to_replace] = mad[idx_to_replace]
scale[idx_to_replace] = this_scale[idx_to_replace]
rmin[idx_to_replace] = this_min[idx_to_replace]

scale = scale.astype(cp.float64)

return scale.get()
        else:
            logger.warning(
                "Tried to use k-quant quantization on CUDA, but CUDA is not available. "
                "Falling back to k-quant quantization on CPU."
            )
            return quant_tensor_k_quant_cpu(data, num_bits, group_size)
    except ImportError:
        logger.info(
            "Using k-quant quantization on CPU, which is time consuming. "
            "Please consider installing cupy to speed it up on CUDA (see https://cupy.dev/), "
            "and torch so that CUDA availability can be checked."
        )
        return quant_tensor_k_quant_cpu(data, num_bits, group_size)
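
A usage sketch for quant_tensor_k_quant_cuda (assumes cupy and a CUDA device are available; otherwise the call falls back to the CPU path): it returns one scale per group of group_size weights as a NumPy array.

# Illustrative usage, not part of the diff.
w = torch.randn(128, 128, dtype=torch.float16)
scales = quant_tensor_k_quant_cuda(w.numpy(), num_bits=4, group_size=32)
assert scales.shape == (w.numel() // 32, 1)  # one scale per 32-weight group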

@register_dtype("int_asym")
def quant_tensor_asym(tensor, bits=4, group_size=-1, v=0, min_scale=1.0, max_scale=1.0, scale_dtype=torch.float16,