Merged
@@ -111,6 +111,7 @@ struct device_info {
bool supports_image; ///< Does engine support images (CL_DEVICE_IMAGE_SUPPORT cap).
bool supports_intel_planar_yuv; ///< Does engine support cl_intel_planar_yuv extension.
bool supports_work_group_collective_functions; ///< Does engine support CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT.
bool supports_non_uniform_work_group; ///< Does engine support non-uniform work-group sizes.

bool supports_imad; ///< Does engine support int8 mad.
bool supports_immad; ///< Does engine support int8 multi mad.

@@ -1160,6 +1160,7 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p
params.engineInfo.supports_intel_required_subgroup_size = device_info.supports_intel_required_subgroup_size;
params.engineInfo.supports_image = device_info.supports_image;
params.engineInfo.supports_work_group_collective_functions = device_info.supports_work_group_collective_functions;
params.engineInfo.supports_non_uniform_work_group = device_info.supports_non_uniform_work_group;

params.engineInfo.supports_imad = device_info.supports_imad;
params.engineInfo.supports_immad = device_info.supports_immad;
13 changes: 8 additions & 5 deletions src/plugins/intel_gpu/src/kernel_selector/kernel_base.cpp
@@ -25,7 +25,8 @@ std::string toString(const kernel_selector::CommonDispatchData& dispatchData) {
}

void KernelBase::CheckDispatchData(const std::string& kernelName, const kernel_selector::CommonDispatchData& dispatchData,
const size_t maxWorkGroupSize) {
const EngineInfo& engineInfo) {
const auto maxWorkGroupSize = engineInfo.maxWorkGroupSize;
if (dispatchData.gws.size() != 3 || dispatchData.lws.size() != 3)
throw std::runtime_error("ERROR: Invalid dispatch data for kernel: " + kernelName + ": " +
": LWS and GWS size is expected to be equal to 3. Actual: " +
@@ -42,10 +43,12 @@ void KernelBase::CheckDispatchData(const std::string& kernelName, const kernel_s
": Dispatch data cannot contain zeros. Actual: " +
toString(dispatchData));

if (dispatchData.gws[i] % dispatchData.lws[i] != 0)
throw std::runtime_error("ERROR: Invalid dispatch data for kernel: " + kernelName +
": GWS must be divisible by corresponding LWS. Actual: " +
toString(dispatchData));
if (!engineInfo.supports_non_uniform_work_group) {
if (dispatchData.gws[i] % dispatchData.lws[i] != 0)
throw std::runtime_error("ERROR: Invalid dispatch data for kernel: " + kernelName +
": GWS must be divisible by corresponding LWS. Actual: " +
toString(dispatchData));
}
}
}

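For context, a minimal standalone OpenCL sketch (not part of this PR; `queue` and `kernel` are assumed to be a valid command queue and kernel) of what the relaxed check permits: with non-uniform work-group support, the global size no longer has to be a multiple of the local size, and the trailing work-group simply receives the remainder.

// Requires the kernel to be compiled with -cl-std=CL2.0 or newer.
size_t gws = 4096 + 128;  // 4224 work-items
size_t lws = 256;         // 4224 % 256 == 128, i.e. a non-uniform NDRange
cl_int err = clEnqueueNDRangeKernel(queue, kernel, 1, nullptr,
                                    &gws, &lws, 0, nullptr, nullptr);
// On devices without CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT this enqueue
// fails with CL_INVALID_WORK_GROUP_SIZE, which is why CheckDispatchData
// keeps the divisibility check for such devices.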
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/src/kernel_selector/kernel_base.h
@@ -63,7 +63,7 @@ class KernelBase {
const std::string kernelName;

static void CheckDispatchData(const std::string& kernelName, const kernel_selector::CommonDispatchData& dispatchData,
const size_t maxWorkGroupSize);
const EngineInfo& engineInfo);
virtual Datatype GetUnitType(const base_params& params) const;

bool IsFusedPrimitiveSupported(const fused_operation_desc& fused_op) const;

@@ -215,7 +215,7 @@ void KernelBaseOpenCL::FillCLKernelData(clKernelData& kernel,
int number_of_outputs,
bool is_dynamic) const {
if (!is_dynamic && !kernel.skip_execution)
KernelBase::CheckDispatchData(kernelMapName, dispatchData, engine_info.maxWorkGroupSize);
KernelBase::CheckDispatchData(kernelMapName, dispatchData, engine_info);
kernel.code.kernelString = GetKernelString(kernelMapName, jit, entryPoint, engine_info, exeMode);
kernel.params.workGroups.global = dispatchData.gws;
kernel.params.workGroups.local = dispatchData.lws;

@@ -390,6 +390,7 @@ struct EngineInfo {
bool bOptHintsSupport = false;
bool supports_microkernels = false;
bool supports_work_group_collective_functions = false;
bool supports_non_uniform_work_group = false;
uint32_t vendor_id = 0x0;
dev_type deviceType = dev_type::integrated_gpu;
uint32_t computeUnitsCount = 0;

@@ -91,7 +91,7 @@ KernelsData AdaptivePoolingRef::GetKernelsData(const Params& params) const {
const auto jit = CreateJit(kernelName, cldnn_jit, entry_point);

auto& kernel = kd.kernels[0];
KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo.maxWorkGroupSize);
KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo);
kernel.params.workGroups.global = dispatchData.gws;
kernel.params.workGroups.local = dispatchData.lws;
kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo);

@@ -156,7 +156,7 @@ KernelsData DeformableConvolutionKernel_bfyx_opt::GetKernelsData(const Params& p

auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[i];
KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo.maxWorkGroupSize);
KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo);
kernel.params.workGroups.global = dispatchData.gws;
kernel.params.workGroups.local = dispatchData.lws;
kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo);

@@ -331,7 +331,7 @@ KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params) const

auto jit = CreateJit(kernelName, cldnnJit, entryPoint);
auto& kernel = kd.kernels[i];
KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo.maxWorkGroupSize);
KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo);
kernel.params.workGroups.global = dispatchData.gws;
kernel.params.workGroups.local = dispatchData.lws;
kernel.code.kernelString = GetKernelString(kernelName, jit, entryPoint, params.engineInfo);

@@ -103,7 +103,7 @@ void ExperimentalDetectronDetectionOutputKernelRef::PrepareKernelCommon(
cldnn_jit.AddConstant(MakeJitConstant(stage_name, "true"));

const auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
KernelBase::CheckDispatchData(kernelName, dispatch_data, params.engineInfo.maxWorkGroupSize);
KernelBase::CheckDispatchData(kernelName, dispatch_data, params.engineInfo);
kernel.params.workGroups.global = dispatch_data.gws;
kernel.params.workGroups.local = dispatch_data.lws;
kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo);

@@ -164,7 +164,7 @@ KernelsData ExperimentalDetectronGenerateProposalsSingleImageRef::GetKernelsData
const auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[i];

KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo.maxWorkGroupSize);
KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo);
kernel.params.workGroups.global = dispatchData.gws;
kernel.params.workGroups.local = dispatchData.lws;
kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo);

@@ -181,7 +181,7 @@ KernelsData GenerateProposalsRef::GetKernelsData(const Params& params) const {
const auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[i];

KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo.maxWorkGroupSize);
KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo);
kernel.params.workGroups.global = dispatchData.gws;
kernel.params.workGroups.local = dispatchData.lws;
kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo);

@@ -105,7 +105,7 @@ KernelsData MatrixNmsKernelRef::GetKernelsData(const Params& params) const {

DispatchData dispatch_data = SetDefault(new_params, i);
auto& kernel = kernel_data.kernels[i];
KernelBase::CheckDispatchData(kernelName, dispatch_data, params.engineInfo.maxWorkGroupSize);
KernelBase::CheckDispatchData(kernelName, dispatch_data, params.engineInfo);
kernel.params.workGroups.global = dispatch_data.gws;
kernel.params.workGroups.local = dispatch_data.lws;
kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo);

@@ -152,7 +152,7 @@ KernelsData MulticlassNmsKernelRef::GetKernelsData(const Params& params) const {
cldnn_jit.AddConstant(MakeJitConstant("MULTICLASSNMS_STAGE_" + std::to_string(i), "true"));

const auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
KernelBase::CheckDispatchData(kernelName, dispatch_data, params.engineInfo.maxWorkGroupSize);
KernelBase::CheckDispatchData(kernelName, dispatch_data, params.engineInfo);
auto& kernel = kd.kernels[i];

kernel.params.workGroups.global = dispatch_data.gws;

@@ -284,7 +284,7 @@ KernelsData NonMaxSuppressionKernelRef::GetKernelsData(const Params& params) con

auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[i];
KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo.maxWorkGroupSize);
KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo);
kernel.params.workGroups.global = dispatchData.gws;
kernel.params.workGroups.local = dispatchData.lws;
kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo);

@@ -51,7 +51,7 @@ DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(uint64_t group_size
const bool has_static_wzp = m_fc->get_input_size() > 4 && optional_w_zp->get_output_partial_shape(0).rank().is_static();
const bool is_wei_i8_u8 = cldnn::one_of(m_fc->get_input_element_type(1), {ov::element::i8, ov::element::u8});

if (is_wei_i8_u8 && use_gs128_for_int8_per_token && adj_group_size == UINT64_MAX) {
if (DynamicQuantizeFullyConnected::ShouldUseGs128(is_wei_i8_u8, use_gs128_for_int8_per_token, adj_group_size)) {
adj_group_size = 128;
}


@@ -12,6 +12,9 @@ class DynamicQuantizeFullyConnected: public ov::pass::MatcherPass {
public:
OPENVINO_MATCHER_PASS_RTTI("DynamicQuantizeFullyConnected");
DynamicQuantizeFullyConnected(uint64_t group_size, bool asymmetric = false, bool precompute_sum = true, bool use_gs128_for_int8_per_token = false);
static bool ShouldUseGs128(bool is_wei_i8u8, bool use_gs128_for_int8_per_token, uint64_t group_size) {
return (is_wei_i8u8 && use_gs128_for_int8_per_token && group_size == UINT64_MAX);
}
};

} // namespace ov::intel_gpu
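The helper's truth table is small enough to pin down with a few assertions. A hypothetical unit-test sketch (not part of this PR; assumes a GoogleTest body and the ov::intel_gpu namespace):

// gs128 applies only to i8/u8 weights, with the feature flag on, and only
// when the configured group size is per-token (UINT64_MAX).
EXPECT_TRUE(DynamicQuantizeFullyConnected::ShouldUseGs128(true, true, UINT64_MAX));
EXPECT_FALSE(DynamicQuantizeFullyConnected::ShouldUseGs128(true, true, 128));
EXPECT_FALSE(DynamicQuantizeFullyConnected::ShouldUseGs128(false, true, UINT64_MAX));
EXPECT_FALSE(DynamicQuantizeFullyConnected::ShouldUseGs128(true, false, UINT64_MAX));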
31 changes: 26 additions & 5 deletions src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -1275,17 +1275,29 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
manager.register_pass<ov::intel_gpu::SinkReshape>();

if (device_info.supports_immad && config.get_use_onednn()) {
bool asymmetric_dyn_quant = config.get_asym_dynamic_quantization();
const bool asymmetric_dyn_quant = config.get_asym_dynamic_quantization();
auto dynamic_quantization_group_size = config.get_dynamic_quantization_group_size();
auto dynamic_quantization_group_size_max = config.get_dynamic_quantization_group_size_max();
bool precomputed_reduction = config.get_dynamic_quantization_precomputed_reduction();
const bool precomputed_reduction = config.get_dynamic_quantization_precomputed_reduction();

const bool group_dyn_quan_allowed = m_context->get_engine().get_device_info().supports_non_uniform_work_group;
// WA: if the platform does not support non-uniform work-groups, dynamic quantization
// with group size 128 may fail to run. This is unlikely in practice, but the
// workaround is added just in case.
const bool use_gs128_for_int8_per_token = m_context->get_engine().get_device_info().arch >= cldnn::gpu_arch::xe2
&& group_dyn_quan_allowed;

pass_config->set_callback<ov::intel_gpu::DynamicQuantizeFullyConnected>([=](const_node_ptr& root) -> bool {
for (size_t i = 0 ; i < root->get_input_node_shared_ptr(0)->get_output_size(); ++i) {
if (root->get_input_node_shared_ptr(0)->get_output_element_type(i) == ov::element::Type_t::f32) {
GPU_DEBUG_TRACE << root->get_friendly_name() << " dyn_quan is turned off: input type is not supported" << std::endl;
return true;
}
}
uint64_t adj_group_size = dynamic_quantization_group_size;
const bool is_wei_i8u8 = cldnn::one_of(root->get_input_element_type(1), {ov::element::i8, ov::element::u8});
if (ov::intel_gpu::DynamicQuantizeFullyConnected::ShouldUseGs128(is_wei_i8u8, use_gs128_for_int8_per_token, adj_group_size)) {
adj_group_size = 128;
}

const auto& input_shape = root->get_input_partial_shape(0);
const size_t input_rank = input_shape.size();
@@ -1305,7 +1317,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
}

// AZP does not support grouped size dyn-quan
GPU_DEBUG_IF(asymmetric_dyn_quant && (dynamic_quantization_group_size != UINT64_MAX)) {
GPU_DEBUG_IF(asymmetric_dyn_quant && (adj_group_size != UINT64_MAX)) {
GPU_DEBUG_TRACE << root->get_friendly_name() << " dyn_quan is turned off: asym quantization does not support grouped quantization" <<
" ('DynamicQuantizeAsym' is enabled with grouped size dyn-quan)" << std::endl;
return true;
@@ -1319,13 +1331,22 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
return true;
}

const bool is_grouped = adj_group_size != UINT64_MAX;
// Quantization must either be per-token, or the hardware must support grouped dyn_quan (via non-uniform work-groups).
if (is_grouped && !group_dyn_quan_allowed) {
GPU_DEBUG_TRACE << root->get_friendly_name() << " dyn_quan is turned off:"
" group_dyn_quan_allowed " << group_dyn_quan_allowed << std::endl;
return true;
}

return false;
});
if (dynamic_quantization_group_size_max < dynamic_quantization_group_size) {

const bool model_allows_group_size = dynamic_quantization_group_size_max >= dynamic_quantization_group_size;
if (!model_allows_group_size) {
GPU_DEBUG_INFO << "dyn_quan is turned off because group_size is larger than max size "
<< dynamic_quantization_group_size << "/" << dynamic_quantization_group_size_max << std::endl;
} else {
const bool use_gs128_for_int8_per_token = m_context->get_engine().get_device_info().arch >= cldnn::gpu_arch::xe_hpg;
manager.register_pass<ov::intel_gpu::DynamicQuantizeFullyConnected>(dynamic_quantization_group_size,
asymmetric_dyn_quant,
precomputed_reduction,
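Since the gs128 logic is now split between the pass callback and the registration site, a condensed sketch of the resulting decision flow may help (variable names as in the diff above; a summary, not code from the PR):

uint64_t adj_group_size = dynamic_quantization_group_size;  // UINT64_MAX means per-token
if (DynamicQuantizeFullyConnected::ShouldUseGs128(is_wei_i8u8,
                                                  use_gs128_for_int8_per_token,
                                                  adj_group_size)) {
    adj_group_size = 128;  // promote int8 per-token to grouped gs128
}
const bool is_grouped = adj_group_size != UINT64_MAX;
if (is_grouped && !group_dyn_quan_allowed) {
    // Grouped dyn-quan requires non-uniform work-group support -> skip the pass.
}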
5 changes: 4 additions & 1 deletion src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp
@@ -259,11 +259,14 @@ device_info init_device_info(const cl::Device& device, const cl::Context& contex
// These queries are available from OPENCL_300: CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT, CL_DEVICE_OPENCL_C_FEATURES
// In OpenCL C 3.0 the work_group_<ops> built-ins are optional, so support for work-group collective functions must be queried explicitly.
info.supports_work_group_collective_functions = device.getInfo<CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT>();
info.supports_non_uniform_work_group = device.getInfo<CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT>();
#elif CL_HPP_TARGET_OPENCL_VERSION >= 200
// OpenCL C2.0: work_group_<ops> are mandetory.
// OpenCL C2.0: work_group_<ops> are mandatory.
info.supports_work_group_collective_functions = true;
info.supports_non_uniform_work_group = true;
#else
info.supports_work_group_collective_functions = false;
info.supports_non_uniform_work_group = false;
#endif

if (info.supports_intel_required_subgroup_size) {
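For reference, the equivalent query through the raw OpenCL C API (a sketch assuming a valid cl_device_id named device_id; the cl::Device wrapper above does the same thing):

cl_bool non_uniform_wg = CL_FALSE;  // stays CL_FALSE on pre-3.0 devices
clGetDeviceInfo(device_id, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT,
                sizeof(non_uniform_wg), &non_uniform_wg, nullptr);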

@@ -30,6 +30,7 @@ enum class TestForSmallInputs { No, Yes }; // Test very small inputs to validat
enum class PrecomputeSum { Disabled, Enabled };
class dynamic_quantization_gpu_tests: public ::testing::Test {
public:
std::string dyn_quan_kernel_id = "";
void test_dynamic_quantization(bool is_caching_test,
const ov::PartialShape& input_shape,
const ov::Shape& data_shape,
@@ -188,9 +189,29 @@ class dynamic_quantization_gpu_tests: public ::testing::Test {
ASSERT_NEAR(output_ptr_ref[i], output_ptr[i], abs_error_threshold);
}
}

auto find_kernel_id = [&network](const std::string& prim_id) {
std::string kernel;
for (auto& info : network->get_primitives_info()) {
if (info.original_id == prim_id)
kernel = info.kernel_id;
}
return kernel;
};

dyn_quan_kernel_id = find_kernel_id("dyn_quan_prim");
}
};

TEST_F(dynamic_quantization_gpu_tests, static_quantizing_large_size_non_uniform_workgroup) {
// If non-uniform work-groups are not supported, this falls back to the dyn_quan_ref kernel;
// if they are supported, it runs on dyn_quan_opt.
this->test_dynamic_quantization(false, {11, 1, 4096+128}, {2048, 1, 4096+128}, QuantizationType::Symmetric, 128);
if (get_test_engine().get_device_info().supports_non_uniform_work_group) {
ASSERT_TRUE(dyn_quan_kernel_id.find("_opt") != std::string::npos);
}
}

TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_large_size) {
this->test_dynamic_quantization(false, {11, 1, 1, 4096}, {2048, 1, 1, 4096});
}