diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp index 5cee2d22e5fd51..416a72c1a527aa 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp @@ -111,6 +111,7 @@ struct device_info { bool supports_image; ///< Does engine support images (CL_DEVICE_IMAGE_SUPPORT cap). bool supports_intel_planar_yuv; ///< Does engine support cl_intel_planar_yuv extension. bool supports_work_group_collective_functions; ///< Does engine support CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT. + bool supports_non_uniform_work_group; ///< Does engine support non-uniform work-group sizes. bool supports_imad; ///< Does engine support int8 mad. bool supports_immad; ///< Does engine support int8 multi mad. diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp index 5c746a85c34c76..9833b750caf06c 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp @@ -1160,6 +1160,7 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p params.engineInfo.supports_intel_required_subgroup_size = device_info.supports_intel_required_subgroup_size; params.engineInfo.supports_image = device_info.supports_image; params.engineInfo.supports_work_group_collective_functions = device_info.supports_work_group_collective_functions; + params.engineInfo.supports_non_uniform_work_group = device_info.supports_non_uniform_work_group; params.engineInfo.supports_imad = device_info.supports_imad; params.engineInfo.supports_immad = device_info.supports_immad; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernel_base.cpp index 
a7afa8e68c145a..3ecc996c4eb5b3 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_base.cpp @@ -25,7 +25,8 @@ std::string toString(const kernel_selector::CommonDispatchData& dispatchData) { } void KernelBase::CheckDispatchData(const std::string& kernelName, const kernel_selector::CommonDispatchData& dispatchData, - const size_t maxWorkGroupSize) { + const EngineInfo& engineInfo) { + const auto maxWorkGroupSize = engineInfo.maxWorkGroupSize; if (dispatchData.gws.size() != 3 || dispatchData.lws.size() != 3) throw std::runtime_error("ERROR: Invalid dispatch data for kernel: " + kernelName + ": " + ": LWS and GWS size is expected to be equal to 3. Actual: " + @@ -42,10 +43,12 @@ void KernelBase::CheckDispatchData(const std::string& kernelName, const kernel_s ": Dispatch data cannot contain zeros. Actual: " + toString(dispatchData)); - if (dispatchData.gws[i] % dispatchData.lws[i] != 0) - throw std::runtime_error("ERROR: Invalid dispatch data for kernel: " + kernelName + - ": GWS must be divisible by corresponding LWS. Actual: " + - toString(dispatchData)); + if (!engineInfo.supports_non_uniform_work_group) { + if (dispatchData.gws[i] % dispatchData.lws[i] != 0) + throw std::runtime_error("ERROR: Invalid dispatch data for kernel: " + kernelName + + ": GWS must be divisible by corresponding LWS. 
Actual: " + + toString(dispatchData)); + } } } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernel_base.h index ea32ec39ae905b..6991b001cd6230 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_base.h @@ -63,7 +63,7 @@ class KernelBase { const std::string kernelName; static void CheckDispatchData(const std::string& kernelName, const kernel_selector::CommonDispatchData& dispatchData, - const size_t maxWorkGroupSize); + const EngineInfo& engineInfo); virtual Datatype GetUnitType(const base_params& params) const; bool IsFusedPrimitiveSupported(const fused_operation_desc& fused_op) const; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_base_opencl.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernel_base_opencl.cpp index f66e7eeae8cf39..c9ba15a619f2af 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_base_opencl.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_base_opencl.cpp @@ -215,7 +215,7 @@ void KernelBaseOpenCL::FillCLKernelData(clKernelData& kernel, int number_of_outputs, bool is_dynamic) const { if (!is_dynamic && !kernel.skip_execution) - KernelBase::CheckDispatchData(kernelMapName, dispatchData, engine_info.maxWorkGroupSize); + KernelBase::CheckDispatchData(kernelMapName, dispatchData, engine_info); kernel.code.kernelString = GetKernelString(kernelMapName, jit, entryPoint, engine_info, exeMode); kernel.params.workGroups.global = dispatchData.gws; kernel.params.workGroups.local = dispatchData.lws; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_params.h b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_params.h index c0381d6e1e321d..8da70f59248ed0 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_params.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_params.h @@ -390,6 +390,7 @@ struct EngineInfo { bool 
bOptHintsSupport = false; bool supports_microkernels = false; bool supports_work_group_collective_functions = false; + bool supports_non_uniform_work_group = false; uint32_t vendor_id = 0x0; dev_type deviceType = dev_type::integrated_gpu; uint32_t computeUnitsCount = 0; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/adaptive_pooling/adaptive_pooling_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/adaptive_pooling/adaptive_pooling_kernel_ref.cpp index 3db6e7d2d9c556..d310c55d9b12e3 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/adaptive_pooling/adaptive_pooling_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/adaptive_pooling/adaptive_pooling_kernel_ref.cpp @@ -91,7 +91,7 @@ KernelsData AdaptivePoolingRef::GetKernelsData(const Params& params) const { const auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; - KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo.maxWorkGroupSize); + KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo); kernel.params.workGroups.global = dispatchData.gws; kernel.params.workGroups.local = dispatchData.lws; kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/deformable_convolution_kernel_bfyx_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/deformable_convolution_kernel_bfyx_opt.cpp index cbd7b4023f2bad..50e6c11b2214a8 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/deformable_convolution_kernel_bfyx_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/deformable_convolution_kernel_bfyx_opt.cpp @@ -156,7 +156,7 @@ KernelsData DeformableConvolutionKernel_bfyx_opt::GetKernelsData(const Params& p auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[i]; - 
KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo.maxWorkGroupSize); + KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo); kernel.params.workGroups.global = dispatchData.gws; kernel.params.workGroups.local = dispatchData.lws; kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/detection_output/detection_output_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/detection_output/detection_output_kernel_ref.cpp index f1b4087927bb88..c6fa9e3537f7c1 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/detection_output/detection_output_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/detection_output/detection_output_kernel_ref.cpp @@ -331,7 +331,7 @@ KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params) const auto jit = CreateJit(kernelName, cldnnJit, entryPoint); auto& kernel = kd.kernels[i]; - KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo.maxWorkGroupSize); + KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo); kernel.params.workGroups.global = dispatchData.gws; kernel.params.workGroups.local = dispatchData.lws; kernel.code.kernelString = GetKernelString(kernelName, jit, entryPoint, params.engineInfo); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/ed_do/detection_output_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/ed_do/detection_output_kernel_ref.cpp index 5e6f58be7120a2..749ebb07734496 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/ed_do/detection_output_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/ed_do/detection_output_kernel_ref.cpp @@ -103,7 +103,7 @@ void ExperimentalDetectronDetectionOutputKernelRef::PrepareKernelCommon( cldnn_jit.AddConstant(MakeJitConstant(stage_name, "true")); const auto jit = 
CreateJit(kernelName, cldnn_jit, entry_point); - KernelBase::CheckDispatchData(kernelName, dispatch_data, params.engineInfo.maxWorkGroupSize); + KernelBase::CheckDispatchData(kernelName, dispatch_data, params.engineInfo); kernel.params.workGroups.global = dispatch_data.gws; kernel.params.workGroups.local = dispatch_data.lws; kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/ed_gpsi/generate_proposals_single_image_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/ed_gpsi/generate_proposals_single_image_kernel_ref.cpp index 270709238a7d38..254888659f51a0 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/ed_gpsi/generate_proposals_single_image_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/ed_gpsi/generate_proposals_single_image_kernel_ref.cpp @@ -164,7 +164,7 @@ KernelsData ExperimentalDetectronGenerateProposalsSingleImageRef::GetKernelsData const auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[i]; - KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo.maxWorkGroupSize); + KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo); kernel.params.workGroups.global = dispatchData.gws; kernel.params.workGroups.local = dispatchData.lws; kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/generate_proposals/generate_proposals_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/generate_proposals/generate_proposals_kernel_ref.cpp index 3c98e22e3d77e5..0afb078cf9245c 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/generate_proposals/generate_proposals_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/generate_proposals/generate_proposals_kernel_ref.cpp @@ -181,7 +181,7 @@ KernelsData 
GenerateProposalsRef::GetKernelsData(const Params& params) const { const auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[i]; - KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo.maxWorkGroupSize); + KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo); kernel.params.workGroups.global = dispatchData.gws; kernel.params.workGroups.local = dispatchData.lws; kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_ref.cpp index 026b61675f66de..fd62b28e97b7e2 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_ref.cpp @@ -105,7 +105,7 @@ KernelsData MatrixNmsKernelRef::GetKernelsData(const Params& params) const { DispatchData dispatch_data = SetDefault(new_params, i); auto& kernel = kernel_data.kernels[i]; - KernelBase::CheckDispatchData(kernelName, dispatch_data, params.engineInfo.maxWorkGroupSize); + KernelBase::CheckDispatchData(kernelName, dispatch_data, params.engineInfo); kernel.params.workGroups.global = dispatch_data.gws; kernel.params.workGroups.local = dispatch_data.lws; kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/multiclass_nms/multiclass_nms_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/multiclass_nms/multiclass_nms_kernel_ref.cpp index ef92740f853ddc..c2b3ce5a0a8835 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/multiclass_nms/multiclass_nms_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/multiclass_nms/multiclass_nms_kernel_ref.cpp @@ -152,7 +152,7 @@ KernelsData 
MulticlassNmsKernelRef::GetKernelsData(const Params& params) const { cldnn_jit.AddConstant(MakeJitConstant("MULTICLASSNMS_STAGE_" + std::to_string(i), "true")); const auto jit = CreateJit(kernelName, cldnn_jit, entry_point); - KernelBase::CheckDispatchData(kernelName, dispatch_data, params.engineInfo.maxWorkGroupSize); + KernelBase::CheckDispatchData(kernelName, dispatch_data, params.engineInfo); auto& kernel = kd.kernels[i]; kernel.params.workGroups.global = dispatch_data.gws; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/non_max_suppression/non_max_suppression_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/non_max_suppression/non_max_suppression_kernel_ref.cpp index 84471046d815db..f7dabb816c2b13 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/non_max_suppression/non_max_suppression_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/non_max_suppression/non_max_suppression_kernel_ref.cpp @@ -284,7 +284,7 @@ KernelsData NonMaxSuppressionKernelRef::GetKernelsData(const Params& params) con auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[i]; - KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo.maxWorkGroupSize); + KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo); kernel.params.workGroups.global = dispatchData.gws; kernel.params.workGroups.local = dispatchData.lws; kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo); diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp index 5be0c6862cdc1e..b51a1019150e19 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp @@ -51,7 +51,7 @@ 
/// @brief Tells whether a per-token dynamic-quantization request should be replaced by group size 128.
///
/// Callers that receive `true` substitute 128 for the requested group size.
///
/// @param is_wei_i8u8                  True when the fully-connected weights are i8 or u8.
/// @param use_gs128_for_int8_per_token True when the caller has enabled the gs128 substitution.
/// @param group_size                   Requested group size; UINT64_MAX is the value callers use for
///                                     per-token (non-grouped) quantization.
/// @return True only when all three conditions hold: int8 weights, substitution enabled,
///         and `group_size == UINT64_MAX`.
static bool ShouldUseGs128(bool is_wei_i8u8, bool use_gs128_for_int8_per_token, uint64_t group_size) {
    // `is_wei_i8u8` is a flag, so it is declared `bool` (it was previously `uint64_t`). Every caller
    // passes a bool, and the value was only ever tested for truthiness, so behaviour is unchanged.
    return is_wei_i8u8 && use_gs128_for_int8_per_token && group_size == UINT64_MAX;
}
asymmetric_dyn_quant = config.get_asym_dynamic_quantization(); + const bool asymmetric_dyn_quant = config.get_asym_dynamic_quantization(); auto dynamic_quantization_group_size = config.get_dynamic_quantization_group_size(); auto dynamic_quantization_group_size_max = config.get_dynamic_quantization_group_size_max(); - bool precomputed_reduction = config.get_dynamic_quantization_precomputed_reduction(); + const bool precomputed_reduction = config.get_dynamic_quantization_precomputed_reduction(); + + const bool group_dyn_quan_allowed = m_context->get_engine().get_device_info().supports_non_uniform_work_group; + // WA: when platform does not support non-uniform-work-group, it may fail to run dynamic quantization for gs128. + // This is unlikely to happen. But this WA is added just in case. + const bool use_gs128_for_int8_per_token = m_context->get_engine().get_device_info().arch >= cldnn::gpu_arch::xe2 + && group_dyn_quan_allowed; + pass_config->set_callback([=](const_node_ptr& root) -> bool { for (size_t i = 0 ; i < root->get_input_node_shared_ptr(0)->get_output_size(); ++i) { if (root->get_input_node_shared_ptr(0)->get_output_element_type(i) == ov::element::Type_t::f32) { @@ -1286,6 +1293,11 @@ void TransformationsPipeline::apply(std::shared_ptr func) { return true; } } + uint64_t adj_group_size = dynamic_quantization_group_size; + const bool is_wei_i8u8 = cldnn::one_of(root->get_input_element_type(1), {ov::element::i8, ov::element::u8}); + if (ov::intel_gpu::DynamicQuantizeFullyConnected::ShouldUseGs128(is_wei_i8u8, use_gs128_for_int8_per_token, adj_group_size)) { + adj_group_size = 128; + } const auto& input_shape = root->get_input_partial_shape(0); const size_t input_rank = input_shape.size(); @@ -1305,7 +1317,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { } // AZP does not support grouped size dyn-quan - GPU_DEBUG_IF(asymmetric_dyn_quant && (dynamic_quantization_group_size != UINT64_MAX)) { + GPU_DEBUG_IF(asymmetric_dyn_quant && (adj_group_size 
!= UINT64_MAX)) { GPU_DEBUG_TRACE << root->get_friendly_name() << " dyn_quan is turned off: asym quantization does not support grouped quantization" << " ('DynamicQuantizeAsym' is enabled with grouped size dyn-quan)" << std::endl; return true; @@ -1319,13 +1331,22 @@ void TransformationsPipeline::apply(std::shared_ptr func) { return true; } + const bool is_grouped = adj_group_size != UINT64_MAX; + // It should be either per-token or hardware should support grouped dyn_quan(through non-uniform-work-group) + if (is_grouped && !group_dyn_quan_allowed) { + GPU_DEBUG_TRACE << root->get_friendly_name() << " dyn_quan is turned off:" + " group_dyn_quan_allowed " << group_dyn_quan_allowed << std::endl; + return true; + } + return false; }); - if (dynamic_quantization_group_size_max < dynamic_quantization_group_size) { + + const bool model_allows_group_size = dynamic_quantization_group_size_max >= dynamic_quantization_group_size; + if (!model_allows_group_size) { GPU_DEBUG_INFO << "dyn_quan is turned off because group_size is larger than max size " << dynamic_quantization_group_size << "/" << dynamic_quantization_group_size_max << std::endl; } else { - const bool use_gs128_for_int8_per_token = m_context->get_engine().get_device_info().arch >= cldnn::gpu_arch::xe_hpg; manager.register_pass(dynamic_quantization_group_size, asymmetric_dyn_quant, precomputed_reduction, diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp index 7eee5dc8800265..ca0193f85e1d3c 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp @@ -259,11 +259,14 @@ device_info init_device_info(const cl::Device& device, const cl::Context& contex // These flags are supported from OPENCL_300: CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT, CL_DEVICE_OPENCL_C_FEATURES // OpenCL C3.0: work_group_ are optional. 
It should be checked 'work group collective functions' are supported in OpenCL C 3.0. info.supports_work_group_collective_functions = device.getInfo(); + info.supports_non_uniform_work_group = device.getInfo(); #elif CL_HPP_TARGET_OPENCL_VERSION >= 200 - // OpenCL C2.0: work_group_ are mandetory. + // OpenCL C2.0: work_group_ are mandatory. info.supports_work_group_collective_functions = true; + info.supports_non_uniform_work_group = true; #else info.supports_work_group_collective_functions = false; + info.supports_non_uniform_work_group = false; #endif if (info.supports_intel_required_subgroup_size) { diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp index cba59edc1ea517..bd160624bc6ec8 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp @@ -30,6 +30,7 @@ enum class TestForSmallInputs { No, Yes }; // Test very small inputs to validat enum class PrecomputeSum { Disabled, Enabled }; class dynamic_quantization_gpu_tests: public ::testing::Test { public: + std::string dyn_quan_kernel_id = ""; void test_dynamic_quantization(bool is_caching_test, const ov::PartialShape& input_shape, const ov::Shape& data_shape, @@ -188,9 +189,29 @@ class dynamic_quantization_gpu_tests: public ::testing::Test { ASSERT_NEAR(output_ptr_ref[i], output_ptr[i], abs_error_threshold); } } + + auto find_kernel_id = [&network](std::string prim_id) { + std::string kernel = ""; + for (auto& info : network->get_primitives_info()) { + if (info.original_id == prim_id) + kernel = info.kernel_id; + } + return kernel; + }; + + dyn_quan_kernel_id = find_kernel_id("dyn_quan_prim"); } }; +TEST_F(dynamic_quantization_gpu_tests, static_quantizing_large_size_non_uniform_workgroup) { + // if non_uniform_workgroup is not supported, it will run on dyn_quan_ref + // if non_uniform_workgroup 
is supported, it will run on dyn_quan_opt + this->test_dynamic_quantization(false, {11, 1, 4096+128}, {2048, 1, 4096+128}, QuantizationType::Symmetric, 128); + if (get_test_engine().get_device_info().supports_non_uniform_work_group) { + ASSERT_TRUE(dyn_quan_kernel_id.find("_opt") != std::string::npos); + } +} + TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_large_size) { this->test_dynamic_quantization(false, {11, 1, 1, 4096}, {2048, 1, 1, 4096}); }