
Commit 5d9bd56

Merge branch 'main' into dev/dlal/fix-header-compilation
2 parents: ab3f389 + 0c6d683

93 files changed

Lines changed: 4577 additions & 1127 deletions


.github/scripts/utils_triton.bash

Lines changed: 1 addition & 1 deletion
@@ -87,7 +87,7 @@ install_triton_pip () {
   # https://github.com/pytorch/pytorch/commits/main/.ci/docker/ci_commit_pins/triton.txt
   # https://github.com/pytorch/pytorch/blob/main/.ci/docker/ci_commit_pins/triton.txt
   # https://github.com/pytorch/pytorch/pull/126098
-  local triton_version="nightly/3.2.0+git0d4682f0"
+  local triton_version="nightly/3.2.0+git4b3bb1f8"

   # BUILD_VARIANT is provided by the github workflow file
   if [ "$BUILD_VARIANT" == "cuda" ] || [ "$BUILD_VARIANT" == "genai" ]; then
Lines changed: 93 additions & 0 deletions

@@ -0,0 +1,93 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <chrono>
#include <initializer_list>
#include <iomanip>
#include <iostream>
#include <vector>

#ifdef _OPENMP
#include <omp.h>
#endif

#include "./BenchUtils.h"
#include "fbgemm/QuantUtils.h"
#include "fbgemm/Types.h"

using namespace std;
using namespace fbgemm;

// T is the type of scale and bias
template <typename T>
void performance_test() {
  constexpr int NWARMUP = 4;
  constexpr int NITER = 256;

  if (is_same<T, float16>::value) {
    cout << "With result as float16" << endl;
  } else {
    cout << "With result as float" << endl;
  }
  cout << setw(6) << "rows" << "," << setw(6) << "cols" << "," << setw(16)
       << "elems_per_usec" << "," << setw(10) << "GB/Sec" << endl;

  for (int rowSize : {100, 120, 1000}) {
    for (int colSize : {16, 64, 128, 256, 512, 1024, 2048}) {
      aligned_vector<uint8_t> inpVec(rowSize * colSize);
      randFill<uint8_t>(inpVec, 0, 20);

      int out_emb_cols = colSize;

      if (is_same<T, float16>::value) {
        out_emb_cols /= 2;
      }
      int outVecSize = rowSize * (out_emb_cols + 2 * sizeof(T));
      aligned_vector<T> outVec(outVecSize);

      double duration = 0.0f;

      duration = measureWithWarmup(
          [&]() {
            Fused8BitRowwiseQuantizedSBFloatToFloatOrHalf(
                inpVec.data(), rowSize, colSize, outVec.data());
          },
          NWARMUP,
          NITER,
          [&]() {
            cache_evict(inpVec);
            cache_evict(outVec);
          });

      float elements_per_usec = rowSize * colSize / (duration * 1e6);

      duration *= 1e9; // convert to ns
      long bytes_read = rowSize * colSize * sizeof(float);
      float gigabytes_per_sec = bytes_read / duration;

      cout << setw(6) << rowSize << ", " << setw(6) << colSize << ",";
      cout << setw(16) << std::fixed << std::setprecision(2)
           << elements_per_usec << ", ";
      cout << setw(10) << std::fixed << std::setprecision(2)
           << gigabytes_per_sec << endl;
    } // for each cols
  } // for each rows
} // performance_test

int main() {
#ifdef _OPENMP
  // Use 1 thread unless OMP_NUM_THREADS is explicitly set.
  const char* val = getenv("OMP_NUM_THREADS");
  if (val == nullptr || !*val) {
    omp_set_num_threads(1);
  }
#endif
  performance_test<float16>();
  performance_test<float>();
  return 0;
}
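The metrics in the new benchmark are derived from the wall-clock duration returned by measureWithWarmup (in seconds): elements per microsecond directly, and GB/s after converting the duration to nanoseconds. A standalone restatement of that arithmetic, with illustrative numbers of my own:

# The benchmark's throughput arithmetic, restated in Python. bytes_read
# mirrors the benchmark's rowSize * colSize * sizeof(float) convention.
def throughput(rows: int, cols: int, duration_sec: float) -> tuple[float, float]:
    elems_per_usec = rows * cols / (duration_sec * 1e6)
    bytes_read = rows * cols * 4  # sizeof(float)
    gigabytes_per_sec = bytes_read / (duration_sec * 1e9)
    return elems_per_usec, gigabytes_per_sec

# Illustrative only: 1000 rows x 2048 cols converted in 50 microseconds.
print(throughput(1000, 2048, 50e-6))  # (40960.0, 163.84)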

defs.bzl

Lines changed: 17 additions & 0 deletions
@@ -1,4 +1,5 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
+# Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
 # All rights reserved.
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -73,6 +74,7 @@ def get_fbgemm_public_headers():
         "include/fbgemm/QuantUtils.h",
         "include/fbgemm/QuantUtilsAvx2.h",
         "include/fbgemm/QuantUtilsAvx512.h",
+        "include/fbgemm/QuantUtilsNeon.h",
         "include/fbgemm/spmmUtils.h",
         "include/fbgemm/spmmUtilsAvx2.h",
         "include/fbgemm/SimdUtils.h",
@@ -153,6 +155,7 @@ def get_fbgemm_inline_sve_srcs(msvc = False, buck = False):
     intrinsics_srcs = [
         "src/FbgemmFP16UKernelsSve128.cc",
         "src/KleidiAIFP16UKernelsNeon.cc",
+        "src/QuantUtilsNeon.cc",
         "src/UtilsSve.cc",
     ] + select({
         "DEFAULT": [],
@@ -165,6 +168,7 @@ def get_fbgemm_inline_sve_srcs(msvc = False, buck = False):
     asm_srcs = [
         "src/FbgemmFP16UKernelsSve128.cc",
         "src/KleidiAIFP16UKernelsNeon.cc",
+        "src/QuantUtilsNeon.cc",
         "src/UtilsSve.cc",
     ] + select({
         "DEFAULT": [],
@@ -180,6 +184,19 @@
     })
     return asm_srcs if not msvc else intrinsics_srcs

+def get_fbgemm_inline_neon_srcs(msvc = False, buck = False):
+    intrinsics_srcs = ["src/UtilsNeon.cc"]
+
+    # FP16 kernels contain inline assembly, and inline assembly syntax for MSVC is different.
+    asm_srcs = ["src/UtilsNeon.cc"]
+    if buck:
+        return select({
+            "DEFAULT": asm_srcs,
+            "ovr_config//compiler:cl": intrinsics_srcs,
+            "ovr_config//cpu:arm64": intrinsics_srcs,
+        })
+    return asm_srcs if not msvc else intrinsics_srcs
+
 def get_fbgemm_autovec_srcs():
     return [
         "src/EmbeddingSpMDMAutovec.cc",

fbgemm_gpu/codegen/genscript/generate_backward_split.py

Lines changed: 13 additions & 4 deletions
@@ -23,7 +23,7 @@
     # in `deeplearning.fbgemm.fbgemm_gpu.codegen.genscript.optimizers`.
     from .optimizers import *
     from .common import CodeTemplate
-    from .optimizer_args import OptimizerArgsSet
+    from .optimizer_args import annotation_dict, OptimizerArgsSet
     from .scripts_argsparse import args
 except ImportError:
     from optimizers import *
@@ -32,7 +32,7 @@
     from common import CodeTemplate

     # pyre-ignore[21]
-    from optimizer_args import OptimizerArgsSet
+    from optimizer_args import annotation_dict, OptimizerArgsSet

     # pyre-ignore[21]
     from scripts_argsparse import args
@@ -187,7 +187,11 @@ def generate_backward_split_gpu(**kwargs: Any) -> None:
             ),
         ]:
             CodeTemplate.load(template_filepath).write(
-                filename, is_forward=False, ssd=ssd, **kwargs
+                filename,
+                is_forward=False,
+                ssd=ssd,
+                schema_annotation=annotation_dict,
+                **kwargs,
             )

         if kwargs.get("has_cpu_support") or kwargs.get("has_gpu_support"):
@@ -242,7 +246,10 @@ def generate_backward_split_cpu(**kwargs: Any) -> None:
             ),
         ]:
             CodeTemplate.load(template_filepath).write(
-                filename, is_forward=False, **kwargs
+                filename,
+                is_forward=False,
+                schema_annotation=annotation_dict,
+                **kwargs,
             )

     @staticmethod
@@ -406,6 +413,8 @@ def generate() -> None:
             ],
             "aux_int": [
                 "iter",  # 0
+                "info_B_num_bits",  # 1
+                "info_B_mask",  # 2
             ],
             "aux_float": [
                 "gwd_lower_bound",  # 0

fbgemm_gpu/codegen/genscript/generate_forward_split.py

Lines changed: 8 additions & 1 deletion
@@ -15,10 +15,14 @@

 try:
     from .common import CodeTemplate
+    from .optimizer_args import annotation_dict
 except ImportError:
     # pyre-ignore[21]
     from common import CodeTemplate

+    # pyre-ignore[21]
+    from optimizer_args import annotation_dict
+

 class ForwardSplitGenerator:
     @staticmethod
@@ -74,6 +78,7 @@ def generate_pt2_wrappers() -> None:
             has_gpu_support=True,
             is_forward=True,
             has_vbe_support=True,
+            schema_annotation=annotation_dict,
         )

         # Generate PT2 forward wrapper (CPU)
@@ -84,9 +89,10 @@
             has_cpu_support=True,
             is_forward=True,
             has_vbe_support=True,
+            schema_annotation=annotation_dict,
         )

-        # Generate PT2 forward wrapper (CUDA)
+        # Generate SSD PT2 forward wrapper (CUDA)
         CodeTemplate.load(
             "training/pt2/embedding_split_host_pt2_cuda_wrapper_template.cpp",
         ).write(
@@ -95,6 +101,7 @@
             is_forward=True,
             has_vbe_support=True,
             ssd=True,
+            schema_annotation=annotation_dict,
         )

     @staticmethod

fbgemm_gpu/codegen/genscript/optimizer_args.py

Lines changed: 43 additions & 2 deletions
@@ -47,6 +47,41 @@ class OptimizerArgsSetItem:
 OptimItem = OptimizerArgsSetItem


+######################################################################
+## Data Dict for the code generator script ##
+######################################################################
+# A dict of tensor names and the annotations that mark each tensor as mutable.
+# This is used to annotate the tensors in the definition schema.
+annotation_dict: Dict[str, str] = {
+    "weights": "(a!)",
+    "weights_host": "(a!)",
+    "weights_dev": "(b!)",
+    "weights_uvm": "(c!)",
+    "weights_lxu_cache": "(d!)",
+    "aux_tensor": "(e!)",
+    "uvm_cache_stats": "(f!)",
+    "momentum1": "(g!)",
+    "momentum1_host": "(g!)",
+    "momentum1_dev": "(h!)",
+    "momentum1_uvm": "(i!)",
+    "momentum2": "(j!)",
+    "momentum2_host": "(j!)",
+    "momentum2_dev": "(k!)",
+    "momentum2_uvm": "(l!)",
+    "prev_iter": "(m!)",
+    "prev_iter_host": "(m!)",
+    "prev_iter_dev": "(n!)",
+    "prev_iter_uvm": "(o!)",
+    "row_counter": "(p!)",
+    "row_counter_host": "(p!)",
+    "row_counter_dev": "(q!)",
+    "row_counter_uvm": "(r!)",
+    "optim_tensor": "(s!)",
+    "delta_weights_host": "(t!)",
+    "delta_weights_dev": "(u!)",
+    "delta_weights_uvm": "(v!)",
+}
+
 ######################################################################
 ## Helper functions for the code generator script ##
 ######################################################################
@@ -146,6 +181,11 @@ def tensor_arg(name: str) -> str:
     return f"Tensor {name}"


+def tensor_arg_annotate(name: str) -> str:
+    annotate = annotation_dict[name] if name in annotation_dict else ""
+    return f"Tensor{annotate} {name}"
+
+
 def double_arg(name: str, default: float = 0.0) -> str:
     return f"double {name} = {default}"

@@ -191,7 +231,8 @@ def schema_sym_int_arg_no_default(name: str) -> str:


 def schema_tensor_list_arg_no_default(name: str) -> str:
-    return f"Tensor[] {name}"
+    annotate = annotation_dict[name] if name in annotation_dict else ""
+    return f"Tensor[]{annotate} {name}"


 def bool_arg(name: str, default: bool = False) -> str:
@@ -409,7 +450,7 @@ def make_function_arg(

 def make_function_schema_arg(ty: ArgType, name: str, default: Union[int, float]) -> str:
     return {
-        ArgType.TENSOR: tensor_arg,
+        ArgType.TENSOR: tensor_arg_annotate,
         ArgType.INT_TENSOR: tensor_arg,
         ArgType.LONG_TENSOR: tensor_arg,
         ArgType.PLACEHOLDER_TENSOR: tensor_arg,
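In PyTorch operator schemas, an annotation such as Tensor(a!) marks the argument as belonging to alias set a and mutated in place; annotation_dict assigns each mutable tensor (and its host/dev/uvm splits) an alias set, and tensor_arg_annotate emits the marker only for names present in the dict. A self-contained restatement with a toy two-entry dict:

# Restatement of tensor_arg_annotate from the diff above, using a toy subset
# of annotation_dict to show the schema fragments it produces.
annotation_dict = {"weights": "(a!)", "weights_dev": "(b!)"}

def tensor_arg_annotate(name: str) -> str:
    annotate = annotation_dict[name] if name in annotation_dict else ""
    return f"Tensor{annotate} {name}"

print(tensor_arg_annotate("weights"))  # -> Tensor(a!) weights
print(tensor_arg_annotate("indices"))  # -> Tensor indices (unannotated)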
