
Commit 5d9bd56

Merge branch 'main' into dev/dlal/fix-header-compilation
2 parents: ab3f389 + 0c6d683

93 files changed

Lines changed: 4577 additions & 1127 deletions


.github/scripts/utils_triton.bash

Lines changed: 1 addition & 1 deletion
@@ -87,7 +87,7 @@ install_triton_pip () {
   # https://github.com/pytorch/pytorch/commits/main/.ci/docker/ci_commit_pins/triton.txt
   # https://github.com/pytorch/pytorch/blob/main/.ci/docker/ci_commit_pins/triton.txt
   # https://github.com/pytorch/pytorch/pull/126098
-  local triton_version="nightly/3.2.0+git0d4682f0"
+  local triton_version="nightly/3.2.0+git4b3bb1f8"

   # BUILD_VARIANT is provided by the github workflow file
   if [ "$BUILD_VARIANT" == "cuda" ] || [ "$BUILD_VARIANT" == "genai" ]; then
Lines changed: 93 additions & 0 deletions

@@ -0,0 +1,93 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <chrono>
#include <initializer_list>
#include <iomanip>
#include <iostream>
#include <vector>

#ifdef _OPENMP
#include <omp.h>
#endif

#include "./BenchUtils.h"
#include "fbgemm/QuantUtils.h"
#include "fbgemm/Types.h"

using namespace std;
using namespace fbgemm;

// T is the type of scale and bias
template <typename T>
void performance_test() {
  constexpr int NWARMUP = 4;
  constexpr int NITER = 256;

  if (is_same<T, float16>::value) {
    cout << "With result as float16" << endl;
  } else {
    cout << "With result as float" << endl;
  }
  cout << setw(6) << "rows" << "," << setw(6) << "cols" << "," << setw(16)
       << "elems_per_usec" << "," << setw(10) << "GB/Sec" << endl;

  for (int rowSize : {100, 120, 1000}) {
    for (int colSize : {16, 64, 128, 256, 512, 1024, 2048}) {
      aligned_vector<uint8_t> inpVec(rowSize * colSize);
      randFill<uint8_t>(inpVec, 0, 20);

      int out_emb_cols = colSize;

      if (is_same<T, float16>::value) {
        out_emb_cols /= 2;
      }
      int outVecSize = rowSize * (out_emb_cols + 2 * sizeof(T));
      aligned_vector<T> outVec(outVecSize);

      double duration = 0.0f;

      duration = measureWithWarmup(
          [&]() {
            Fused8BitRowwiseQuantizedSBFloatToFloatOrHalf(
                inpVec.data(), rowSize, colSize, outVec.data());
          },
          NWARMUP,
          NITER,
          [&]() {
            cache_evict(inpVec);
            cache_evict(outVec);
          });

      float elements_per_usec = rowSize * colSize / (duration * 1e6);

      duration *= 1e9; // convert to ns
      long bytes_read = rowSize * colSize * sizeof(float);
      float gigabytes_per_sec = bytes_read / duration;

      cout << setw(6) << rowSize << ", " << setw(6) << colSize << ",";
      cout << setw(16) << std::fixed << std::setprecision(2)
           << elements_per_usec << ", ";
      cout << setw(10) << std::fixed << std::setprecision(2)
           << gigabytes_per_sec << endl;
    } // for each cols
  } // for each rows
} // performance_test

int main() {
#ifdef _OPENMP
  // Use 1 thread unless OMP_NUM_THREADS is explicitly set.
  const char* val = getenv("OMP_NUM_THREADS");
  if (val == nullptr || !*val) {
    omp_set_num_threads(1);
  }
#endif
  performance_test<float16>();
  performance_test<float>();
  return 0;
}
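The metrics in the new benchmark are derived from the wall-clock duration returned by measureWithWarmup (in seconds): elements per microsecond directly, and GB/s after converting the duration to nanoseconds. A standalone restatement of that arithmetic, with illustrative numbers of my own:

# The benchmark's throughput arithmetic, restated in Python. bytes_read
# mirrors the benchmark's rowSize * colSize * sizeof(float) convention.
def throughput(rows: int, cols: int, duration_sec: float) -> tuple[float, float]:
    elems_per_usec = rows * cols / (duration_sec * 1e6)
    bytes_read = rows * cols * 4  # sizeof(float)
    gigabytes_per_sec = bytes_read / (duration_sec * 1e9)
    return elems_per_usec, gigabytes_per_sec

# Illustrative only: 1000 rows x 2048 cols converted in 50 microseconds.
print(throughput(1000, 2048, 50e-6))  # (40960.0, 163.84)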

defs.bzl

Lines changed: 17 additions & 0 deletions
@@ -1,4 +1,5 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
+# Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
 # All rights reserved.
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -73,6 +74,7 @@ def get_fbgemm_public_headers():
         "include/fbgemm/QuantUtils.h",
         "include/fbgemm/QuantUtilsAvx2.h",
         "include/fbgemm/QuantUtilsAvx512.h",
+        "include/fbgemm/QuantUtilsNeon.h",
         "include/fbgemm/spmmUtils.h",
         "include/fbgemm/spmmUtilsAvx2.h",
         "include/fbgemm/SimdUtils.h",
@@ -153,6 +155,7 @@ def get_fbgemm_inline_sve_srcs(msvc = False, buck = False):
     intrinsics_srcs = [
         "src/FbgemmFP16UKernelsSve128.cc",
         "src/KleidiAIFP16UKernelsNeon.cc",
+        "src/QuantUtilsNeon.cc",
         "src/UtilsSve.cc",
     ] + select({
         "DEFAULT": [],
@@ -165,6 +168,7 @@ def get_fbgemm_inline_sve_srcs(msvc = False, buck = False):
     asm_srcs = [
         "src/FbgemmFP16UKernelsSve128.cc",
         "src/KleidiAIFP16UKernelsNeon.cc",
+        "src/QuantUtilsNeon.cc",
         "src/UtilsSve.cc",
     ] + select({
         "DEFAULT": [],
@@ -180,6 +184,19 @@
     })
     return asm_srcs if not msvc else intrinsics_srcs

+def get_fbgemm_inline_neon_srcs(msvc = False, buck = False):
+    intrinsics_srcs = ["src/UtilsNeon.cc"]
+
+    # FP16 kernels contain inline assembly, and inline assembly syntax for MSVC is different.
+    asm_srcs = ["src/UtilsNeon.cc"]
+    if buck:
+        return select({
+            "DEFAULT": asm_srcs,
+            "ovr_config//compiler:cl": intrinsics_srcs,
+            "ovr_config//cpu:arm64": intrinsics_srcs,
+        })
+    return asm_srcs if not msvc else intrinsics_srcs
+
 def get_fbgemm_autovec_srcs():
     return [
         "src/EmbeddingSpMDMAutovec.cc",

fbgemm_gpu/codegen/genscript/generate_backward_split.py

Lines changed: 13 additions & 4 deletions
@@ -23,7 +23,7 @@
     # in `deeplearning.fbgemm.fbgemm_gpu.codegen.genscript.optimizers`.
     from .optimizers import *
     from .common import CodeTemplate
-    from .optimizer_args import OptimizerArgsSet
+    from .optimizer_args import annotation_dict, OptimizerArgsSet
     from .scripts_argsparse import args
 except ImportError:
     from optimizers import *
@@ -32,7 +32,7 @@
     from common import CodeTemplate

     # pyre-ignore[21]
-    from optimizer_args import OptimizerArgsSet
+    from optimizer_args import annotation_dict, OptimizerArgsSet

     # pyre-ignore[21]
     from scripts_argsparse import args
@@ -187,7 +187,11 @@ def generate_backward_split_gpu(**kwargs: Any) -> None:
             ),
         ]:
             CodeTemplate.load(template_filepath).write(
-                filename, is_forward=False, ssd=ssd, **kwargs
+                filename,
+                is_forward=False,
+                ssd=ssd,
+                schema_annotation=annotation_dict,
+                **kwargs,
             )

         if kwargs.get("has_cpu_support") or kwargs.get("has_gpu_support"):
@@ -242,7 +246,10 @@ def generate_backward_split_cpu(**kwargs: Any) -> None:
             ),
         ]:
             CodeTemplate.load(template_filepath).write(
-                filename, is_forward=False, **kwargs
+                filename,
+                is_forward=False,
+                schema_annotation=annotation_dict,
+                **kwargs,
             )

     @staticmethod
@@ -406,6 +413,8 @@ def generate() -> None:
             ],
             "aux_int": [
                 "iter",  # 0
+                "info_B_num_bits",  # 1
+                "info_B_mask",  # 2
             ],
             "aux_float": [
                 "gwd_lower_bound",  # 0

fbgemm_gpu/codegen/genscript/generate_forward_split.py

Lines changed: 8 additions & 1 deletion
@@ -15,10 +15,14 @@

 try:
     from .common import CodeTemplate
+    from .optimizer_args import annotation_dict
 except ImportError:
     # pyre-ignore[21]
     from common import CodeTemplate

+    # pyre-ignore[21]
+    from optimizer_args import annotation_dict
+

 class ForwardSplitGenerator:
     @staticmethod
@@ -74,6 +78,7 @@ def generate_pt2_wrappers() -> None:
             has_gpu_support=True,
             is_forward=True,
             has_vbe_support=True,
+            schema_annotation=annotation_dict,
         )

         # Generate PT2 forward wrapper (CPU)
@@ -84,9 +89,10 @@
             has_cpu_support=True,
             is_forward=True,
             has_vbe_support=True,
+            schema_annotation=annotation_dict,
         )

-        # Generate PT2 forward wrapper (CUDA)
+        # Generate SSD PT2 forward wrapper (CUDA)
         CodeTemplate.load(
             "training/pt2/embedding_split_host_pt2_cuda_wrapper_template.cpp",
         ).write(
@@ -95,6 +101,7 @@
             is_forward=True,
             has_vbe_support=True,
             ssd=True,
+            schema_annotation=annotation_dict,
         )

     @staticmethod

fbgemm_gpu/codegen/genscript/optimizer_args.py

Lines changed: 43 additions & 2 deletions
@@ -47,6 +47,41 @@ class OptimizerArgsSetItem:
 OptimItem = OptimizerArgsSetItem


+######################################################################
+## Data Dict for the code generator script ##
+######################################################################
+# A dict of tensor names and the annotations that mark each tensor as mutable.
+# This is used to annotate the tensors in the definition schema.
+annotation_dict: Dict[str, str] = {
+    "weights": "(a!)",
+    "weights_host": "(a!)",
+    "weights_dev": "(b!)",
+    "weights_uvm": "(c!)",
+    "weights_lxu_cache": "(d!)",
+    "aux_tensor": "(e!)",
+    "uvm_cache_stats": "(f!)",
+    "momentum1": "(g!)",
+    "momentum1_host": "(g!)",
+    "momentum1_dev": "(h!)",
+    "momentum1_uvm": "(i!)",
+    "momentum2": "(j!)",
+    "momentum2_host": "(j!)",
+    "momentum2_dev": "(k!)",
+    "momentum2_uvm": "(l!)",
+    "prev_iter": "(m!)",
+    "prev_iter_host": "(m!)",
+    "prev_iter_dev": "(n!)",
+    "prev_iter_uvm": "(o!)",
+    "row_counter": "(p!)",
+    "row_counter_host": "(p!)",
+    "row_counter_dev": "(q!)",
+    "row_counter_uvm": "(r!)",
+    "optim_tensor": "(s!)",
+    "delta_weights_host": "(t!)",
+    "delta_weights_dev": "(u!)",
+    "delta_weights_uvm": "(v!)",
+}
+
 ######################################################################
 ## Helper functions for the code generator script ##
 ######################################################################
@@ -146,6 +181,11 @@ def tensor_arg(name: str) -> str:
     return f"Tensor {name}"


+def tensor_arg_annotate(name: str) -> str:
+    annotate = annotation_dict[name] if name in annotation_dict else ""
+    return f"Tensor{annotate} {name}"
+
+
 def double_arg(name: str, default: float = 0.0) -> str:
     return f"double {name} = {default}"

@@ -191,7 +231,8 @@ def schema_sym_int_arg_no_default(name: str) -> str:


 def schema_tensor_list_arg_no_default(name: str) -> str:
-    return f"Tensor[] {name}"
+    annotate = annotation_dict[name] if name in annotation_dict else ""
+    return f"Tensor[]{annotate} {name}"


 def bool_arg(name: str, default: bool = False) -> str:
@@ -409,7 +450,7 @@ def make_function_arg(

 def make_function_schema_arg(ty: ArgType, name: str, default: Union[int, float]) -> str:
     return {
-        ArgType.TENSOR: tensor_arg,
+        ArgType.TENSOR: tensor_arg_annotate,
         ArgType.INT_TENSOR: tensor_arg,
         ArgType.LONG_TENSOR: tensor_arg,
         ArgType.PLACEHOLDER_TENSOR: tensor_arg,
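In PyTorch operator schemas, an annotation such as Tensor(a!) marks the argument as belonging to alias set a and mutated in place; annotation_dict assigns each mutable tensor (and its host/dev/uvm splits) an alias set, and tensor_arg_annotate emits the marker only for names present in the dict. A self-contained restatement with a toy two-entry dict:

# Restatement of tensor_arg_annotate from the diff above, using a toy subset
# of annotation_dict to show the schema fragments it produces.
annotation_dict = {"weights": "(a!)", "weights_dev": "(b!)"}

def tensor_arg_annotate(name: str) -> str:
    annotate = annotation_dict[name] if name in annotation_dict else ""
    return f"Tensor{annotate} {name}"

print(tensor_arg_annotate("weights"))  # -> Tensor(a!) weights
print(tensor_arg_annotate("indices"))  # -> Tensor indices (unannotated)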
