Diffstat (limited to 'src/gpu/cl/kernels')
-rw-r--r--  src/gpu/cl/kernels/ClActivationKernel.cpp  276
-rw-r--r--  src/gpu/cl/kernels/ClActivationKernel.h  76
-rw-r--r--  src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp  164
-rw-r--r--  src/gpu/cl/kernels/ClBatchConcatenateKernel.h  75
-rw-r--r--  src/gpu/cl/kernels/ClCastKernel.cpp  172
-rw-r--r--  src/gpu/cl/kernels/ClCastKernel.h  83
-rw-r--r--  src/gpu/cl/kernels/ClCol2ImKernel.cpp  193
-rw-r--r--  src/gpu/cl/kernels/ClCol2ImKernel.h  95
-rw-r--r--  src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp  132
-rw-r--r--  src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h  80
-rw-r--r--  src/gpu/cl/kernels/ClCopyKernel.cpp  182
-rw-r--r--  src/gpu/cl/kernels/ClCopyKernel.h  72
-rw-r--r--  src/gpu/cl/kernels/ClCropKernel.cpp  155
-rw-r--r--  src/gpu/cl/kernels/ClCropKernel.h  89
-rw-r--r--  src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp  150
-rw-r--r--  src/gpu/cl/kernels/ClDepthConcatenateKernel.h  75
-rw-r--r--  src/gpu/cl/kernels/ClDequantizeKernel.cpp  162
-rw-r--r--  src/gpu/cl/kernels/ClDequantizeKernel.h  64
-rw-r--r--  src/gpu/cl/kernels/ClDirectConv2dKernel.cpp  542
-rw-r--r--  src/gpu/cl/kernels/ClDirectConv2dKernel.h  107
-rw-r--r--  src/gpu/cl/kernels/ClDirectConv3dKernel.cpp  282
-rw-r--r--  src/gpu/cl/kernels/ClDirectConv3dKernel.h  97
-rw-r--r--  src/gpu/cl/kernels/ClElementwiseKernel.cpp  603
-rw-r--r--  src/gpu/cl/kernels/ClElementwiseKernel.h  238
-rw-r--r--  src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp  192
-rw-r--r--  src/gpu/cl/kernels/ClElementwiseUnaryKernel.h  68
-rw-r--r--  src/gpu/cl/kernels/ClFillKernel.cpp  125
-rw-r--r--  src/gpu/cl/kernels/ClFillKernel.h  71
-rw-r--r--  src/gpu/cl/kernels/ClFloorKernel.cpp  126
-rw-r--r--  src/gpu/cl/kernels/ClFloorKernel.h  64
-rw-r--r--  src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp  364
-rw-r--r--  src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h  91
-rw-r--r--  src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp  327
-rw-r--r--  src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h  100
-rw-r--r--  src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp  609
-rw-r--r--  src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h  114
-rw-r--r--  src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp  530
-rw-r--r--  src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h  107
-rw-r--r--  src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp  238
-rw-r--r--  src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h  96
-rw-r--r--  src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp  307
-rw-r--r--  src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h  107
-rw-r--r--  src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp  183
-rw-r--r--  src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h  85
-rw-r--r--  src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp  183
-rw-r--r--  src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h  87
-rw-r--r--  src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp  185
-rw-r--r--  src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h  87
-rw-r--r--  src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp  246
-rw-r--r--  src/gpu/cl/kernels/ClGemmLowpReductionKernel.h  136
-rw-r--r--  src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp  471
-rw-r--r--  src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h  104
-rw-r--r--  src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp  447
-rw-r--r--  src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h  128
-rw-r--r--  src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp  473
-rw-r--r--  src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h  121
-rw-r--r--  src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp  408
-rw-r--r--  src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h  102
-rw-r--r--  src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp  216
-rw-r--r--  src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h  82
-rw-r--r--  src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp  192
-rw-r--r--  src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h  87
-rw-r--r--  src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp  143
-rw-r--r--  src/gpu/cl/kernels/ClHeightConcatenateKernel.h  73
-rw-r--r--  src/gpu/cl/kernels/ClIm2ColKernel.cpp  488
-rw-r--r--  src/gpu/cl/kernels/ClIm2ColKernel.h  117
-rw-r--r--  src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp  172
-rw-r--r--  src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h  87
-rw-r--r--  src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp  335
-rw-r--r--  src/gpu/cl/kernels/ClIndirectConv2dKernel.h  97
-rw-r--r--  src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp  262
-rw-r--r--  src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h  84
-rw-r--r--  src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp  263
-rw-r--r--  src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h  78
-rw-r--r--  src/gpu/cl/kernels/ClMatMulNativeKernel.cpp  284
-rw-r--r--  src/gpu/cl/kernels/ClMatMulNativeKernel.h  85
-rw-r--r--  src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp  235
-rw-r--r--  src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h  104
-rw-r--r--  src/gpu/cl/kernels/ClMulKernel.cpp  484
-rw-r--r--  src/gpu/cl/kernels/ClMulKernel.h  138
-rw-r--r--  src/gpu/cl/kernels/ClPermuteKernel.cpp  157
-rw-r--r--  src/gpu/cl/kernels/ClPermuteKernel.h  76
-rw-r--r--  src/gpu/cl/kernels/ClPool2dKernel.cpp  398
-rw-r--r--  src/gpu/cl/kernels/ClPool2dKernel.h  80
-rw-r--r--  src/gpu/cl/kernels/ClPool3dKernel.cpp  275
-rw-r--r--  src/gpu/cl/kernels/ClPool3dKernel.h  76
-rw-r--r--  src/gpu/cl/kernels/ClQuantizeKernel.cpp  184
-rw-r--r--  src/gpu/cl/kernels/ClQuantizeKernel.h  69
-rw-r--r--  src/gpu/cl/kernels/ClReshapeKernel.cpp  126
-rw-r--r--  src/gpu/cl/kernels/ClReshapeKernel.h  64
-rw-r--r--  src/gpu/cl/kernels/ClScaleKernel.cpp  271
-rw-r--r--  src/gpu/cl/kernels/ClScaleKernel.h  72
-rw-r--r--  src/gpu/cl/kernels/ClScatterKernel.cpp  250
-rw-r--r--  src/gpu/cl/kernels/ClScatterKernel.h  77
-rw-r--r--  src/gpu/cl/kernels/ClSoftmaxKernel.cpp  263
-rw-r--r--  src/gpu/cl/kernels/ClSoftmaxKernel.h  85
-rw-r--r--  src/gpu/cl/kernels/ClTransposeKernel.cpp  151
-rw-r--r--  src/gpu/cl/kernels/ClTransposeKernel.h  64
-rw-r--r--  src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp  290
-rw-r--r--  src/gpu/cl/kernels/ClTransposedConvolutionKernel.h  73
-rw-r--r--  src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp  180
-rw-r--r--  src/gpu/cl/kernels/ClWeightsReshapeKernel.h  98
-rw-r--r--  src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp  173
-rw-r--r--  src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h  71
-rw-r--r--  src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp  216
-rw-r--r--  src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h  85
-rw-r--r--  src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp  140
-rw-r--r--  src/gpu/cl/kernels/ClWidthConcatenateKernel.h  72
-rw-r--r--  src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp  188
-rw-r--r--  src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h  84
-rw-r--r--  src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp  320
-rw-r--r--  src/gpu/cl/kernels/ClWinogradInputTransformKernel.h  93
-rw-r--r--  src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp  356
-rw-r--r--  src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h  98
-rw-r--r--  src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp  226
-rw-r--r--  src/gpu/cl/kernels/gemm/ClGemmHelpers.h  144
-rw-r--r--  src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h  123
-rw-r--r--  src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp  256
-rw-r--r--  src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h  69
-rw-r--r--  src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp  74
-rw-r--r--  src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h  59
-rw-r--r--  src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp  172
-rw-r--r--  src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h  63
-rw-r--r--  src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h  72
-rw-r--r--  src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp  375
-rw-r--r--  src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h  73
-rw-r--r--  src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp  538
-rw-r--r--  src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h  67
-rw-r--r--  src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h  70
-rw-r--r--  src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp  577
-rw-r--r--  src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h  81
-rw-r--r--  src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp  755
-rw-r--r--  src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h  73
-rw-r--r--  src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h  70
-rw-r--r--  src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp  108
-rw-r--r--  src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h  72
136 files changed, 24744 insertions, 0 deletions
diff --git a/src/gpu/cl/kernels/ClActivationKernel.cpp b/src/gpu/cl/kernels/ClActivationKernel.cpp
new file mode 100644
index 0000000000..a85296f7cd
--- /dev/null
+++ b/src/gpu/cl/kernels/ClActivationKernel.cpp
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClActivationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+#include <set>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM16, DataType::F16, DataType::F32);
+
+ static std::set<ActivationLayerInfo::ActivationFunction> quantized_supported_activations = {
+ ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, ActivationLayerInfo::ActivationFunction::LOGISTIC,
+ ActivationLayerInfo::ActivationFunction::TANH, ActivationLayerInfo::ActivationFunction::HARD_SWISH,
+ ActivationLayerInfo::ActivationFunction::LEAKY_RELU,
+ };
+ const DataType data_type = src->data_type();
+ const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info();
+ const ActivationLayerInfo::ActivationFunction f_act = act_info.activation();
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(data_type) &&
+ (quantized_supported_activations.count(f_act) == 0),
+ "For Quantized data type only hard swish, leaky relu, tanh, logistic, relu and "
+ "lower/upper bounded relu are supported");
+
+ ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 &&
+ (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+ (oq_info != QuantizationInfo(1.f / 128.f, 128)));
+ ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 &&
+ (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+ (oq_info != QuantizationInfo(1.f / 256.f, 0)));
+
+ ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) &&
+ (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+ (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
+ ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) &&
+ (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+ (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
+
+ ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED &&
+ (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+ (oq_info != QuantizationInfo(1.f / 128.f, 0)));
+ ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED &&
+ (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+ (oq_info != QuantizationInfo(1.f / 256.f, -128)));
+
+ // Checks performed when destination is configured
+ if ((dst != nullptr) && (dst->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ }
+
+ return Status{};
+}
+} // namespace
+
+ClActivationKernel::ClActivationKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClActivationKernel::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ ActivationLayerInfo act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+
+ auto padding_info = get_padding_info({src, dst});
+
+ _run_in_place = (dst == nullptr) || (dst == src);
+
+ if (dst != nullptr)
+ {
+ // Destination auto initialization if not yet initialized
+ auto_init_if_empty(*dst, *src->clone());
+ }
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, act_info));
+
+ const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0));
+
+ const DataType dt = src->data_type();
+ float a_const = act_info.a();
+ float b_const = act_info.b();
+
+ const ActivationLayerInfo::ActivationFunction f_act = act_info.activation();
+ const bool is_quantized = is_data_type_quantized(dt);
+ const bool perform_activation_in_float = (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) ||
+ (f_act == ActivationLayerInfo::ActivationFunction::TANH) ||
+ (f_act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) ||
+ (f_act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU);
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option_if(perform_activation_in_float, "-DFLOAT_DOMAIN");
+ build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
+ build_opts.add_option("-DACT=" + lower_string(string_from_activation_func(f_act)));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+
+ std::string kernel_name = std::string("activation_layer");
+
+ // Set quantization info build options
+ if (is_quantized)
+ {
+ const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+
+ if (!perform_activation_in_float)
+ {
+ int a_const_int = 0;
+ int b_const_int = 0;
+
+ // Create quantized version of constants a, b if needed
+ switch (dt)
+ {
+ case DataType::QASYMM8:
+ {
+ a_const_int = quantize_qasymm8(a_const, iq_info);
+ b_const_int = quantize_qasymm8(b_const, iq_info);
+ }
+ break;
+ case DataType::QASYMM8_SIGNED:
+ {
+ a_const_int = quantize_qasymm8_signed(a_const, iq_info);
+ b_const_int = quantize_qasymm8_signed(b_const, iq_info);
+ }
+ break;
+ case DataType::QSYMM16:
+ {
+ a_const_int = quantize_qsymm16(a_const, iq_info);
+ b_const_int = quantize_qsymm16(b_const, iq_info);
+ }
+ break;
+ default:
+ break;
+ }
+ build_opts.add_option(("-DA_VAL=" + support::cpp11::to_string(a_const_int)));
+ build_opts.add_option(("-DB_VAL=" + support::cpp11::to_string(b_const_int)));
+ }
+ else
+ {
+ build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(a_const)));
+ build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(b_const)));
+ }
+
+ // Quantized value of 0 corresponds to the offset o1
+ build_opts.add_option(
+ ("-DCONST_0=" + (is_data_type_quantized_asymmetric(dt) ? support::cpp11::to_string(iq_info.offset) : "0")));
+ build_opts.add_option(("-DS1_VAL=" + float_to_string_with_full_precision(iq_info.scale)));
+ build_opts.add_option_if(is_data_type_quantized_asymmetric(dt),
+ "-DO1_VAL=" + support::cpp11::to_string(iq_info.offset));
+
+ // Set correct kernel name
+ kernel_name += perform_activation_in_float ? std::string("_quant_f32") : std::string("_quant");
+
+ // Set scale and offset of the source and destination if they have different quantization info
+ if (dst != nullptr)
+ {
+ const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
+
+ if (iq_info != oq_info)
+ {
+ build_opts.add_option(("-DS2_VAL=" + float_to_string_with_full_precision(oq_info.scale)));
+ build_opts.add_option_if(is_data_type_quantized_asymmetric(dt),
+ "-DO2_VAL=" + support::cpp11::to_string(oq_info.offset));
+ }
+ }
+ }
+ else
+ {
+ // Set A, B constants in build options for float types
+ build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(a_const)));
+ build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(b_const)));
+ }
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
+ ICLKernel::configure_internal(win);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "activation_layer_";
+ _config_id += lower_string(string_from_data_type(dt));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src->dimension(1));
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, act_info));
+ return Status{};
+}
+
+void ClActivationKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ ARM_COMPUTE_ERROR_ON(_run_in_place && src != dst);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src, slice);
+ if (!_run_in_place)
+ {
+ add_3D_tensor_argument(idx, dst, slice);
+ }
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
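
The VEC_SIZE / VEC_SIZE_LEFTOVER build options above control how many elements each work-item processes and how many trailing elements remain at the end of a row. A standalone sketch of that relationship (illustrative helper names only, not the library's adjust_vec_size implementation):

#include <algorithm>
#include <cstddef>

// Target roughly 16 bytes per access, but never exceed the row length (dimension 0).
static unsigned int pick_vec_size(size_t element_size, size_t dim0)
{
    const size_t preferred = 16 / element_size; // e.g. 4 lanes for F32, 16 for QASYMM8
    return static_cast<unsigned int>(std::max<size_t>(1, std::min(preferred, dim0)));
}

// Trailing elements that do not fill a whole vector; the kernel handles them via the
// VEC_SIZE_LEFTOVER define (src->dimension(0) % num_elems_processed_per_iteration).
static unsigned int leftover_elements(size_t dim0, unsigned int vec_size)
{
    return static_cast<unsigned int>(dim0 % vec_size);
}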
diff --git a/src/gpu/cl/kernels/ClActivationKernel.h b/src/gpu/cl/kernels/ClActivationKernel.h
new file mode 100644
index 0000000000..ab7607bb82
--- /dev/null
+++ b/src/gpu/cl/kernels/ClActivationKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_ACTIVATION_KERNEL_H
+#define ARM_COMPUTE_CL_ACTIVATION_KERNEL_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the activation kernel. */
+class ClActivationKernel : public IClKernel
+{
+public:
+ ClActivationKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClActivationKernel);
+ /** Configure kernel for a given list of arguments
+ *
+ * @note If the output tensor is a nullptr, the activation function will be performed in-place
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in, out] src Source tensor info. If @p dst is nullptr, this tensor will also store the result
+ * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
+ * @param[out] dst Destination tensor info. Data type supported: same as @p src
+ * @param[in] act_info Activation layer information.
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ ActivationLayerInfo act_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClActivationKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+
+private:
+ bool _run_in_place{false};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_ACTIVATION_KERNEL_H */
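
A rough sketch of how the static validate() entry point can be used to probe support before committing to this kernel (assumed include paths; in practice these internal kernels are normally driven through the operator layer, e.g. ClActivation, rather than called directly):

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"

#include "src/gpu/cl/kernels/ClActivationKernel.h"

bool relu_f32_supported()
{
    using namespace arm_compute;
    const TensorInfo          src(TensorShape(128U, 32U), 1, DataType::F32);
    const TensorInfo          dst(TensorShape(128U, 32U), 1, DataType::F32);
    const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::RELU);

    // validate() returns a Status instead of throwing, so a caller can fall back to another path.
    const Status status = opencl::kernels::ClActivationKernel::validate(&src, &dst, act);
    return status.error_code() == ErrorCode::OK;
}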
diff --git a/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp b/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp
new file mode 100644
index 0000000000..a853f6bc1b
--- /dev/null
+++ b/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClBatchConcatenateKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX));
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) != dst->dimension(Window::DimY));
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimZ) != dst->dimension(Window::DimZ));
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(3) + batch_offset > dst->dimension(3));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(4, src, dst);
+
+ return Status{};
+}
+} // namespace
+
+ClBatchConcatenateKernel::ClBatchConcatenateKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ unsigned int batch_offset,
+ ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, batch_offset, dst));
+
+ auto padding_info = get_padding_info({src, dst});
+
+ _batch_offset = batch_offset;
+
+ const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0));
+
+ // Add build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+ if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
+ {
+ const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
+
+ build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
+ build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
+ build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale));
+ build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
+ }
+
+ std::string kernel_name = "concatenate";
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Configure kernel window
+ auto win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
+ win.set(3, Window::Dimension(0, src->tensor_shape()[3], 1));
+ ICLKernel::configure_internal(win);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "concatenate_";
+ _config_id += support::cpp11::to_string(3);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(batch_offset);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src->dimension(3));
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClBatchConcatenateKernel::validate(const arm_compute::ITensorInfo *src,
+ unsigned int batch_offset,
+ const arm_compute::ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, batch_offset, dst));
+ return Status{};
+}
+
+void ClBatchConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ Window slice = window.first_slice_window_3D();
+
+ const int offset_to_first_elements_in_bytes = _batch_offset * dst->info()->strides_in_bytes()[3];
+
+ unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters
+ _kernel.setArg<cl_int>(idx, offset_to_first_elements_in_bytes);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src, slice);
+ add_3D_tensor_argument(idx, dst, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClBatchConcatenateKernel.h b/src/gpu/cl/kernels/ClBatchConcatenateKernel.h
new file mode 100644
index 0000000000..549576b628
--- /dev/null
+++ b/src/gpu/cl/kernels/ClBatchConcatenateKernel.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_BATCH_CONCATENATE_KERNEL_H
+#define ARM_COMPUTE_CL_BATCH_CONCATENATE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the batch concatenate kernel.
+ * The src tensor will be concatenated into the destination tensor.
+ */
+class ClBatchConcatenateKernel : public IClKernel
+{
+public:
+ ClBatchConcatenateKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClBatchConcatenateKernel);
+ /** Initialise the kernel's source and destination
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: All.
+ * @param[in] batch_offset The offset on axis # 3.
+ * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src.
+ *
+ * @note: The dst tensor's two lowest dimensions can't be smaller than the src tensor's.
+ * @note: The differences between the two lowest dimensions of dst and src must each be divisible by 2.
+ *
+ */
+ void
+ configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClBatchConcatenateKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+
+private:
+ unsigned int _batch_offset{0};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_BATCH_CONCATENATE_KERNEL_H */
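
As run_op shows, batch_offset is turned into a byte offset through the destination's stride on dimension 3, so each input of a batch-wise concatenation is configured with the number of batches already written by the inputs before it. A small illustrative helper (not part of the library) for deriving those offsets:

#include <vector>

// Returns the batch_offset to pass when configuring one ClBatchConcatenateKernel per input,
// given the batch count (dimension 3) of each input tensor in concatenation order.
static std::vector<unsigned int> batch_offsets(const std::vector<unsigned int> &input_batches)
{
    std::vector<unsigned int> offsets;
    offsets.reserve(input_batches.size());
    unsigned int written = 0;
    for (const unsigned int batches : input_batches)
    {
        offsets.push_back(written); // this input starts right after the ones already placed
        written += batches;
    }
    return offsets;
}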
diff --git a/src/gpu/cl/kernels/ClCastKernel.cpp b/src/gpu/cl/kernels/ClCastKernel.cpp
new file mode 100644
index 0000000000..bbffcf55a3
--- /dev/null
+++ b/src/gpu/cl/kernels/ClCastKernel.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2016-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClCastKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
+{
+ ARM_COMPUTE_UNUSED(policy);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ src, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8_PER_CHANNEL, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16,
+ DataType::F32, DataType::S64, DataType::U64);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::S16, DataType::U16, DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == dst->data_type(), "src and dst data types must be different");
+
+ // Validate in case of configured dst
+ if (dst->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+ }
+
+ return Status{};
+}
+} // namespace
+
+ClCastKernel::ClCastKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClCastKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ ConvertPolicy policy)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ // Auto initialize dst shape if not initialized (We can only auto-configure the shape, datatype must be given)
+ set_shape_if_empty(*dst, src->tensor_shape());
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy));
+
+ auto padding_info = get_padding_info({src, dst});
+
+ // Get data sizes
+ const size_t src_size = data_size_from_type(src->data_type());
+ const size_t dst_size = data_size_from_type(dst->data_type());
+
+ // Get number of elements to process per iterations
+ const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0));
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type()));
+ build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type()));
+ // Conversions from float always SATURATE as out-of-bounds conversion from float->integer is implementation defined
+ build_opts.add_option_if(is_data_type_float(src->data_type()) || policy == ConvertPolicy::SATURATE, "-DSATURATE");
+ build_opts.add_option_if(is_data_type_float(src->data_type()) || is_data_type_float(dst->data_type()),
+ "-DIS_DATA_TYPE_FLOAT");
+ build_opts.add_option_if(is_data_type_quantized(src->data_type()), "-DIS_DATA_TYPE_QUANTIZED");
+
+ // Create kernel
+ const std::string kernel_name = (src_size >= dst_size) ? "cast_down" : "cast_up";
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Configure kernel
+ Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
+ ICLKernel::configure_internal(win);
+
+ // Collapse window
+ const Window &full_window = window();
+ Window collapsed_window = full_window.collapse_if_possible(full_window, Window::DimZ);
+ ICLKernel::configure_internal(collapsed_window);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(src->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(1));
+}
+
+Status ClCastKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, policy));
+ return Status{};
+}
+
+void ClCastKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src, slice);
+ add_3D_tensor_argument(idx, dst, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClCastKernel.h b/src/gpu/cl/kernels/ClCastKernel.h
new file mode 100644
index 0000000000..07b0b61443
--- /dev/null
+++ b/src/gpu/cl/kernels/ClCastKernel.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_CAST_KERNEL_H
+#define ARM_COMPUTE_CL_CAST_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Casts a given tensor to a new type
+ *
+ * @note When casting between quantized types the scale and zeroPoint are ignored
+ */
+class ClCastKernel : public IClKernel
+{
+public:
+ ClCastKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCastKernel);
+ /** Set the src and dst of the kernel.
+ *
+ * Valid conversions src -> dst :
+ *
+ * - QSYMM8_PER_CHANNEL -> QASYMM8 (ATTENTION: it is the user's responsibility to keep track of the quantization info in the TensorInfo meta-data)
+ * - U8 -> S8, U16, S16, U32, S32, F16, F32
+ * - S8 -> U8, U16, S16, U32, S32, F16, F32
+ * - U16 -> U8, S8, S16, U32, S32, F16, F32
+ * - S16 -> U8, S8, U16, U32, S32, F16, F32
+ * - U32 -> U8, S8, U16, S16, S32, F16, F32
+ * - S64 -> U8, S8, U16, S16, U32, S32, F16, F32
+ * - U64 -> U8, S8, U16, S16, U32, S32, F16, F32
+ * - S32 -> U8, S8, U16, S16, U32, F16, F32
+ * - F16 -> U8, S8, U16, S16, U32, S32, F32
+ * - F32 -> U8, S8, U16, S16, U32, S32, F16
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src The source tensor to convert. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/U64/S64/F16/F32.
+ * @param[out] dst The destination tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
+ * @param[in] policy Conversion policy
+ */
+ void
+ configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClCastKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_CAST_KERNEL_H */
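
The build options in ClCastKernel.cpp always add -DSATURATE when the source is a float type, because an out-of-range float-to-integer conversion is implementation-defined (or undefined) behaviour. A host-side sketch of what saturation means for a float to S8 cast (illustrative only; the OpenCL kernel uses its own convert built-ins):

#include <algorithm>
#include <cstdint>

// Clamp to the representable range first, then truncate toward zero as a plain C cast would.
static int8_t cast_f32_to_s8_saturate(float v)
{
    const float clamped = std::min(127.0f, std::max(-128.0f, v));
    return static_cast<int8_t>(clamped);
}
// Without the clamp, a cast of 1000.0f to int8_t would not be guaranteed to yield 127.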
diff --git a/src/gpu/cl/kernels/ClCol2ImKernel.cpp b/src/gpu/cl/kernels/ClCol2ImKernel.cpp
new file mode 100644
index 0000000000..9972e07f05
--- /dev/null
+++ b/src/gpu/cl/kernels/ClCol2ImKernel.cpp
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClCol2ImKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+#include <cmath>
+
+namespace arm_compute
+{
+using namespace misc::shape_calculator;
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const Size2D &convolved_dims,
+ unsigned int num_groups)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+
+ // Checks performed when output is configured
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, true, num_groups));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_layout() != DataLayout::NCHW,
+ "Col2Im output's data layout must always be NCHW");
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*dst, src->clone()
+ ->set_tensor_shape(compute_col2im_shape(*src, convolved_dims, true, num_groups))
+ .set_data_layout(DataLayout::NCHW));
+
+ constexpr unsigned int num_elems_read_per_iteration = 8;
+
+ // Configure window
+ Window win = calculate_max_window(*src, Steps(num_elems_read_per_iteration));
+
+ // Update window and padding just for the input tensor as we cannot access out-of-bounds elements in the output one
+ AccessWindowHorizontal input_access(src, 0, num_elems_read_per_iteration);
+ bool window_changed = update_window_and_padding(win, input_access);
+
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+ClCol2ImKernel::ClCol2ImKernel() : _convolved_dims()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClCol2ImKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const Size2D &convolved_dims,
+ unsigned int num_groups)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, convolved_dims, num_groups));
+
+ _convolved_dims = convolved_dims;
+
+ const DataType data_type = src->data_type();
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src->element_size()));
+ build_opts.add_option("-DWIDTH_INPUT=" + support::cpp11::to_string(src->dimension(0)));
+ build_opts.add_option("-DWIDTH_OUTPUT=" + support::cpp11::to_string(_convolved_dims.width));
+ build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
+
+ _kernel = create_kernel(compile_context, "col2im", build_opts.options());
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(src, dst, _convolved_dims, num_groups);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ IClKernel::configure_internal(win_config.second);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "col2im_";
+ _config_id += lower_string(string_from_data_type(src->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(num_groups);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(1));
+}
+
+Status ClCol2ImKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const Size2D &convolved_dims,
+ unsigned int num_groups)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, convolved_dims, num_groups));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(src->clone().get(), dst->clone().get(), convolved_dims, num_groups).first);
+ return Status{};
+}
+
+void ClCol2ImKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IClKernel::window(), window);
+
+ bool is_collapsed = false;
+ bool is_collapsed_out = false;
+
+ auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ Window out_window;
+ out_window.use_tensor_dimensions(dst->info()->tensor_shape());
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &is_collapsed);
+ Window collapsed_out = out_window.collapse_if_possible(out_window, 3, &is_collapsed_out);
+
+ ARM_COMPUTE_ERROR_ON(is_collapsed != is_collapsed_out);
+
+ Window slice = collapsed.first_slice_window_3D();
+ Window slice_out = collapsed_out.first_slice_window_4D();
+ do
+ {
+ // Set inputs
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src, slice);
+ add_4D_tensor_argument(idx, dst, slice_out);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClCol2ImKernel.h b/src/gpu/cl/kernels/ClCol2ImKernel.h
new file mode 100644
index 0000000000..34194aba01
--- /dev/null
+++ b/src/gpu/cl/kernels/ClCol2ImKernel.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_COL2IM_KERNEL_H
+#define ARM_COMPUTE_CL_COL2IM_KERNEL_H
+
+#include "arm_compute/core/Size2D.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the col2im reshaping kernel.
+ *
+ * Rearranges each matrix column into image blocks. It's the inverse operation of @ref opencl::kernels::ClIm2ColKernel.
+ *
+ * For example, a vector of 9 elements can be reshaped to a block(image) of 3x3:
+ *
+ * @f[
+ * \left( \begin{array}{ccccccccc}
+ * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccc}
+ * a0 & a1 & a2 \\
+ * a3 & a4 & a5 \\
+ * a6 & a7 & a8 \\
+ * \end{array} \right)
+ * @f]
+ */
+class ClCol2ImKernel : public IClKernel
+{
+public:
+ /** Default constructor */
+ ClCol2ImKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCol2ImKernel);
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src The input tensor info to convert. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+ * @param[out] dst The output tensor info. The 3 lowest dimensions represent a single output [width, height, OFM],
+ * while the rest represent batches of outputs. Data types supported: Same as @p src. Data layout: NCHW
+ * @param[in] convolved_dims Output convolved dimensions.
+ * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution
+ */
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const Size2D &convolved_dims,
+ unsigned int num_groups = 1);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClCol2ImKernel::configure()
+ *
+ * @return a status
+ */
+ static Status
+ validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+public:
+ Size2D _convolved_dims;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CL_COL2IM_KERNEL_H */
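
The class comment above describes the reshape as laying a column of width x height elements back out as an image block. A tiny standalone sketch of that index mapping (row-major, single block, no groups):

#include <array>
#include <cstddef>

// Copies a column of W*H elements into an H x W block: column[y * W + x] -> block[y][x].
template <std::size_t W, std::size_t H>
std::array<std::array<float, W>, H> col2im_block(const std::array<float, W * H> &column)
{
    std::array<std::array<float, W>, H> block{};
    for (std::size_t y = 0; y < H; ++y)
    {
        for (std::size_t x = 0; x < W; ++x)
        {
            block[y][x] = column[y * W + x];
        }
    }
    return block;
}
// col2im_block<3, 3>(...) reproduces the 9-element to 3x3 example in the class comment.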
diff --git a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp
new file mode 100644
index 0000000000..85d3c3939c
--- /dev/null
+++ b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+ClConvertFullyConnectedWeightsKernel::ClConvertFullyConnectedWeightsKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ // Output tensor auto initialisation if not yet initialized
+ auto_init_if_empty(*dst, *src->clone());
+
+ auto padding_info = get_padding_info({src, dst});
+
+ ARM_COMPUTE_ERROR_THROW_ON(
+ ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout));
+
+ const DataLayout src_data_layout = (data_layout == DataLayout::NCHW) ? DataLayout::NHWC : DataLayout::NCHW;
+
+ const int width_idx = get_data_layout_dimension_index(src_data_layout, DataLayoutDimension::WIDTH);
+ const int height_idx = get_data_layout_dimension_index(src_data_layout, DataLayoutDimension::HEIGHT);
+ const int channel_idx = get_data_layout_dimension_index(src_data_layout, DataLayoutDimension::CHANNEL);
+
+ const unsigned int num_elems_per_src_plane = original_src_shape[width_idx] * original_src_shape[height_idx];
+ const unsigned int num_channels = original_src_shape[channel_idx];
+
+ const unsigned int factor_1 = (data_layout == DataLayout::NCHW) ? num_elems_per_src_plane : num_channels;
+ const unsigned int factor_2 = (data_layout == DataLayout::NCHW) ? num_channels : num_elems_per_src_plane;
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()));
+ build_opts.add_option("-DFACTOR_1=" + support::cpp11::to_string(factor_1));
+ build_opts.add_option("-DFACTOR_2=" + support::cpp11::to_string(factor_2));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, "convert_fc_weights", build_opts.options());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*src, Steps());
+ ICLKernel::configure_internal(win);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(1) != original_src_shape.total_size_lower(3));
+ ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN);
+
+ // Checks performed when dst is configured
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+ }
+
+ return Status{};
+}
+
+void ClConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, src, window);
+ add_2D_tensor_argument(idx, dst, window);
+ enqueue(queue, *this, window, lws_hint());
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
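The FACTOR_1/FACTOR_2 build options computed in configure() above encode a row permutation of the 2D weights matrix. The following standalone sketch (hypothetical host-side code, not the OpenCL kernel itself) shows the NCHW-to-NHWC direction of that permutation, with plane_size and channels playing the roles of the two factors:

#include <cstddef>
#include <vector>

// weights is a row-major [num_rows x num_cols] matrix whose rows correspond to the
// flattened input of the Fully Connected layer (num_rows == W * H * C).
std::vector<float> convert_fc_weights_nchw_to_nhwc(const std::vector<float> &weights,
                                                   std::size_t num_rows,
                                                   std::size_t num_cols,
                                                   std::size_t plane_size, // W * H
                                                   std::size_t channels)   // C
{
    std::vector<float> out(weights.size());
    for (std::size_t row = 0; row < num_rows; ++row)
    {
        // NCHW row index c * plane_size + p maps to NHWC row index p * channels + c
        const std::size_t c       = row / plane_size;
        const std::size_t p       = row % plane_size;
        const std::size_t new_row = p * channels + c;
        for (std::size_t col = 0; col < num_cols; ++col)
        {
            out[new_row * num_cols + col] = weights[row * num_cols + col];
        }
    }
    return out;
}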
diff --git a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h
new file mode 100644
index 0000000000..0ddb54561a
--- /dev/null
+++ b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H
+#define ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface to convert the 2D Fully Connected weights from NCHW to NHWC or vice versa.
+ *
+ * @note This kernel can be applied to the 2D weights used by a Fully Connected layer if:
+ *       - The Fully Connected layer follows a Convolution layer
+ *       - The data layout used by the network does not match the one the model has been trained in.
+ *
+ * @note This kernel assumes the weights are already reshaped (transposed)
+ */
+class ClConvertFullyConnectedWeightsKernel : public IClKernel
+{
+public:
+ ClConvertFullyConnectedWeightsKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClConvertFullyConnectedWeightsKernel);
+ /** Set the src and dst tensor.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
+ * @param[out] dst The converted weights tensor info. Shape and Data Type: Same as @p src.
+ * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer).
+ * @param[in] data_layout The data layout the weights have been trained in.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClConvertFullyConnectedWeightsKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClCopyKernel.cpp b/src/gpu/cl/kernels/ClCopyKernel.cpp
new file mode 100644
index 0000000000..c80ef664f5
--- /dev/null
+++ b/src/gpu/cl/kernels/ClCopyKernel.cpp
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClCopyKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window = nullptr)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+
+ // Validate dst if initialized
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+ if (dst_window == nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape());
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst_window->shape());
+ }
+ }
+
+ return Status{};
+}
+
+} // namespace
+
+ClCopyKernel::ClCopyKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClCopyKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ Window *dst_window)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, dst_window));
+
+ auto padding_info = get_padding_info({src, dst});
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*dst, *src);
+
+ // Configure window
+ const unsigned int vec_size_x = adjust_vec_size(16 / src->element_size(), src->dimension(0));
+
+ const Window win_config = calculate_max_window(*src, Steps(vec_size_x));
+
+ if (dst_window != nullptr)
+ {
+ _has_dst_window = true;
+ _dst_window = Window(*dst_window);
+ const int width_x = dst_window->num_iterations(0);
+ const int vec_size_x_leftover = width_x % vec_size_x;
+ const bool multi_access_x = width_x >= static_cast<int32_t>(vec_size_x);
+
+ if (multi_access_x)
+ {
+ _dst_window.set(Window::DimX,
+ Window::Dimension(dst_window->x().start(),
+ ceil_to_multiple(dst_window->x().end(), vec_size_x), vec_size_x));
+ }
+
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftover));
+ }
+ else
+ {
+ const int width_x = src->tensor_shape().x();
+ const int vec_size_x_leftover = width_x % vec_size_x;
+
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftover));
+ }
+
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+
+ // Build kernel
+ _kernel = create_kernel(compile_context, "copy_tensor", build_opts.options());
+
+ // Validate and set the window
+ ICLKernel::configure_internal(win_config);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status
+ClCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, Window *dst_window)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, dst_window));
+
+ return Status{};
+}
+
+void ClCopyKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ Window slice;
+
+ if (_has_dst_window)
+ {
+ slice = window.first_slice_window_3D();
+ Window out_slice = _dst_window.first_slice_window_3D();
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src, slice);
+ add_3D_tensor_argument(idx, dst, out_slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (window.slide_window_slice_3D(slice) && _dst_window.slide_window_slice_3D(out_slice));
+ }
+ else
+ {
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ slice = collapsed.first_slice_window_3D();
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src, slice);
+ add_3D_tensor_argument(idx, dst, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice));
+ }
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClCopyKernel.h b/src/gpu/cl/kernels/ClCopyKernel.h
new file mode 100644
index 0000000000..f915bf672d
--- /dev/null
+++ b/src/gpu/cl/kernels/ClCopyKernel.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_COPY_KERNEL_H
+#define ARM_COMPUTE_CL_COPY_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to perform a copy between two tensors */
+class ClCopyKernel : public IClKernel
+{
+public:
+ ClCopyKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCopyKernel);
+ /** Initialize the kernel's src, dst.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: All.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ Window *dst_window = nullptr);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClCopyKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window = nullptr);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ Window _dst_window{};
+ bool _has_dst_window{};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_COPY_KERNEL_H */
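The header above follows the validate/configure/run_op pattern shared by all kernels in this patch. A minimal usage sketch is shown below; it assumes a CLCompileContext (compile_context), a cl::CommandQueue (queue) and two ICLTensor objects (cl_src, cl_dst) backed by the given infos already exist, and it is illustrative rather than a verbatim excerpt from the library:

TensorInfo src_info(TensorShape(32U, 16U), 1, DataType::F32);
TensorInfo dst_info(TensorShape(32U, 16U), 1, DataType::F32);

// 1. Check that the configuration is supported
ARM_COMPUTE_ERROR_THROW_ON(opencl::kernels::ClCopyKernel::validate(&src_info, &dst_info));

// 2. Build the OpenCL kernel and its execution window
opencl::kernels::ClCopyKernel copy;
copy.configure(compile_context, &src_info, &dst_info);

// 3. Bind the run-time tensors and enqueue
ITensorPack pack;
pack.add_const_tensor(TensorType::ACL_SRC, cl_src);
pack.add_tensor(TensorType::ACL_DST, cl_dst);
copy.run_op(pack, copy.window(), queue);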
diff --git a/src/gpu/cl/kernels/ClCropKernel.cpp b/src/gpu/cl/kernels/ClCropKernel.cpp
new file mode 100644
index 0000000000..0c503e13fc
--- /dev/null
+++ b/src/gpu/cl/kernels/ClCropKernel.cpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClCropKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+#include <map>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+ClCropKernel::ClCropKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClCropKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value,
+ Window *dst_window)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, start, end, batch_index, extrapolation_value, dst_window));
+
+ _start = start;
+ _batch_index = batch_index;
+ _extrapolation_value = extrapolation_value;
+
+ const uint32_t vec_size_x = 4;
+ // Create and update the window (if needed)
+ Window win = calculate_max_window(*dst);
+
+ if (dst_window != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *dst_window);
+ win = *dst_window;
+ }
+
+ const uint32_t dst_width_x = win.num_iterations(0);
+ const bool multi_access_x = dst_width_x >= vec_size_x;
+ const bool remainder_x = dst_width_x % vec_size_x > 0;
+
+ if (multi_access_x)
+ {
+ win.set(Window::DimX,
+ Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+ }
+ ICLKernel::configure_internal(win);
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+ build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option_if(multi_access_x && remainder_x,
+ "-DLAST_ACCESSED_X=" +
+ support::cpp11::to_string(std::max<int>(dst_width_x - vec_size_x, 0)));
+ build_opts.add_option_if(start.x > end.x, "-DWIDTH_FLIPPED=");
+ build_opts.add_option_if(start.y > end.y, "-DHEIGHT_FLIPPED=");
+ _kernel = create_kernel(compile_context, "crop_tensor", build_opts.options());
+}
+
+Status ClCropKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value,
+ Window *dst_window)
+{
+ ARM_COMPUTE_UNUSED(extrapolation_value, dst_window);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(start.x < 0 || start.y < 0 || end.x < 0 || end.y < 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ start.x >= static_cast<int32_t>(src->dimension(1)) || start.y >= static_cast<int32_t>(src->dimension(2)) ||
+ end.x >= static_cast<int32_t>(src->dimension(1)) || end.y >= static_cast<int32_t>(src->dimension(2)));
+ ARM_COMPUTE_RETURN_ERROR_ON(batch_index >= src->dimension(3));
+ if (dst_window != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(dst_window->x().step() != 1);
+ }
+ if (dst->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(dst, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->num_dimensions() > 3);
+ }
+ return Status{};
+}
+
+void ClCropKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ Window in_slice = Window();
+ in_slice.use_tensor_dimensions(src->info()->tensor_shape());
+ in_slice.set(Window::DimX,
+ Window::Dimension(in_slice.x().start(), ceil_to_multiple(in_slice.x().end(), window.x().step()),
+ window.x().step()));
+ in_slice.set(3, Window::Dimension(_batch_index, _batch_index + 1, 1));
+
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src, in_slice);
+ add_3D_tensor_argument(idx, dst, window);
+ add_argument(idx, _start.x);
+ add_argument(idx, _start.y);
+ enqueue(queue, *this, window, lws_hint());
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClCropKernel.h b/src/gpu/cl/kernels/ClCropKernel.h
new file mode 100644
index 0000000000..506262608c
--- /dev/null
+++ b/src/gpu/cl/kernels/ClCropKernel.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_CROP_KERNEL_H
+#define ARM_COMPUTE_CL_CROP_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to crop a 2D region of a source tensor into a destination tensor */
+class ClCropKernel : public IClKernel
+{
+public:
+ ClCropKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCropKernel);
+ /** Configure kernel
+ *
+ * @note Supported tensor rank: up to 4
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data type supported: All. Data layouts supported: NHWC.
+ * @param[out] dst Destination tensor info. Data type supported: F32
+ * @param[in] start Coordinates of where to start cropping the image.
+ * @param[in] end Coordinates of where to end cropping the image.
+ * @param[in] batch_index Fourth dimension index of the 3D image to crop in @p src.
+ * @param[in] extrapolation_value (Optional) Value to be used for values outside of the image. Default is 0.
+ * @param[in] dst_window (Optional) Output window to be used in case the cropped image is being copied into a tensor. Default is nullptr.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value = 0,
+ Window *dst_window = nullptr);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClCropKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value = 0,
+ Window *dst_window = nullptr);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ Coordinates2D _start{};
+ uint32_t _batch_index{};
+ float _extrapolation_value{};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_CROP_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp b/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp
new file mode 100644
index 0000000000..ec44d88f01
--- /dev/null
+++ b/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2017-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClDepthConcatenateKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX));
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) != dst->dimension(Window::DimY));
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(2) + depth_offset > dst->dimension(2));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(3, src, dst);
+
+ return Status{};
+}
+} // namespace
+
+ClDepthConcatenateKernel::ClDepthConcatenateKernel() : _depth_offset(0)
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ unsigned int depth_offset,
+ ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, depth_offset, dst));
+
+ auto padding_info = get_padding_info({src, dst});
+
+ _depth_offset = depth_offset;
+
+ const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0));
+
+ // Add build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+ if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
+ {
+ const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
+
+ build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
+ build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
+ build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale));
+ build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
+ }
+
+ std::string kernel_name = "concatenate";
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Configure kernel window
+ auto win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
+ win.set(Window::DimZ, Window::Dimension(0, src->tensor_shape().z(), 1));
+ ICLKernel::configure_internal(win);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClDepthConcatenateKernel::validate(const arm_compute::ITensorInfo *src,
+ unsigned int depth_offset,
+ const arm_compute::ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, depth_offset, dst));
+ return Status{};
+}
+
+void ClDepthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ Window slice = window.first_slice_window_3D();
+
+ const int offset_to_first_elements_in_bytes = _depth_offset * dst->info()->strides_in_bytes()[2];
+
+ unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters
+ _kernel.setArg<cl_int>(idx, offset_to_first_elements_in_bytes);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src, slice);
+ add_3D_tensor_argument(idx, dst, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClDepthConcatenateKernel.h b/src/gpu/cl/kernels/ClDepthConcatenateKernel.h
new file mode 100644
index 0000000000..539f010303
--- /dev/null
+++ b/src/gpu/cl/kernels/ClDepthConcatenateKernel.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_DEPTH_CONCATENATE_KERNEL_H
+#define ARM_COMPUTE_CL_DEPTH_CONCATENATE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the depth concatenate kernel.
+ * The src tensor will be concatenated into the dst tensor.
+ */
+class ClDepthConcatenateKernel : public IClKernel
+{
+public:
+ ClDepthConcatenateKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDepthConcatenateKernel);
+ /** Initialise the kernel's source and destination
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] depth_offset The offset on the Z axis.
+ * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src.
+ *
+ * @note The dst tensor's two lowest dimensions can't be smaller than the src tensor's.
+ * @note The differences between the two lowest dimensions of src and dst need to be divisible by 2.
+ *
+ */
+ void
+ configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClDepthConcatenateKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+
+private:
+ unsigned int _depth_offset;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DEPTH_CONCATENATE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClDequantizeKernel.cpp b/src/gpu/cl/kernels/ClDequantizeKernel.cpp
new file mode 100644
index 0000000000..53429ab1aa
--- /dev/null
+++ b/src/gpu/cl/kernels/ClDequantizeKernel.cpp
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClDequantizeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8,
+ DataType::QSYMM16);
+
+ if (dst->tensor_shape().total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+ }
+
+ return Status{};
+}
+} // namespace
+
+ClDequantizeKernel::ClDequantizeKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClDequantizeKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*dst, src->tensor_shape(), 1, DataType::F32);
+
+ auto padding_info = get_padding_info({src, dst});
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
+
+ const int vec_size_x = 16 / dst->element_size();
+ const int output_width_x = dst->tensor_shape().x();
+ const bool multi_access_x = (output_width_x / vec_size_x > 0);
+
+ const bool is_quantized_per_channel = is_data_type_quantized_per_channel(src->data_type());
+ std::string kernel_name = "dequantization_layer";
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ if (!is_quantized_per_channel)
+ {
+ const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
+ const int qoffset = is_data_type_quantized_asymmetric(src->data_type()) ? qinfo.offset : 0;
+ build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
+ build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qoffset));
+ }
+ else
+ {
+ kernel_name += "_per_channel";
+ kernel_name += src->data_layout() == DataLayout::NCHW ? "_nchw" : "_nhwc";
+ }
+
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option("-DDATA_TYPE_SRC=" + get_cl_type_from_data_type(src->data_type()));
+ build_opts.add_option("-DDATA_TYPE_DST=" + get_cl_type_from_data_type(dst->data_type()));
+ build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(
+ std::max<int>(output_width_x - vec_size_x, 0)));
+
+ // Create kernel name
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*dst);
+ if (multi_access_x)
+ {
+ win.set(Window::DimX,
+ Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+ }
+ ICLKernel::configure_internal(win);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClDequantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
+ return Status{};
+}
+
+void ClDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ const bool is_quantized_per_channel = is_data_type_quantized_per_channel(src->info()->data_type());
+
+ // Collapse window
+ Window new_window = is_quantized_per_channel ? window.collapse_if_possible(ICLKernel::window(), 4)
+ : window.collapse_if_possible(ICLKernel::window(), 3);
+ Window slice = new_window.first_slice_window_3D();
+
+ if (is_quantized_per_channel)
+ {
+ unsigned int idx = num_arguments_per_3D_tensor() * 2; //Skip the input and output parameters
+ _kernel.setArg(idx++, src->quantization().scale->cl_buffer());
+ }
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src, slice);
+ add_3D_tensor_argument(idx, dst, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (new_window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClDequantizeKernel.h b/src/gpu/cl/kernels/ClDequantizeKernel.h
new file mode 100644
index 0000000000..a32f506c9a
--- /dev/null
+++ b/src/gpu/cl/kernels/ClDequantizeKernel.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_DEQUANTIZE_KERNEL_H
+#define ARM_COMPUTE_CL_DEQUANTIZE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the dequantization layer kernel. */
+class ClDequantizeKernel : public IClKernel
+{
+public:
+ ClDequantizeKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDequantizeKernel);
+ /** Initialise the kernel's input and output
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
+ * @param[out] dst Destination tensor info. Data types supported: F16/F32.
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClDequantizeKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DEQUANTIZE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
new file mode 100644
index 0000000000..7cf1958c1b
--- /dev/null
+++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
@@ -0,0 +1,542 @@
+/*
+ * Copyright (c) 2017-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClDirectConv2dKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLUtils.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ const DirectConvComputeKernelInfo &desc)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
+
+ const DataLayout data_layout = src->data_layout();
+ const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx),
+ "Weights feature map dimension should match the respective src's one");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_input_to_cl_image == true,
+ "Export to CLImage is not supported for the input tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_output_to_cl_image == true,
+ "Export to CLImage is not supported for the output tensor");
+
+ if (data_layout == DataLayout::NCHW)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx),
+ "Weights should have same width and height");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3,
+ "Strides larger than 3 not supported for 1x1 convolution.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 ||
+ weights->dimension(width_idx) == 9) &&
+ std::get<0>(conv_info.stride()) > 2,
+ "Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled(), "Fused activation is not supported for NCHW layout");
+
+ if (is_data_type_quantized(src->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 &&
+ weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9,
+ "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types");
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 &&
+ weights->dimension(width_idx) != 5,
+ "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types");
+ }
+ }
+
+ if (data_layout == DataLayout::NHWC)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled() && !is_data_type_float(src->data_type()),
+ "Fused activation in NHWC is only supported for floating point.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8,
+ "M0 can only be greater than 0 and less than or equal to 8");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 &&
+ desc.n0 != 16,
+ "N0 can only be: 1, 2, 3, 4, 8, and 16");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 &&
+ desc.k0 != 16,
+ "K0 can only be: 1, 2, 3, 4, 8, and 16");
+ if (desc.export_weights_to_cl_image)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16,
+ "K0 can only be: 4, 8, and 16");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(weights),
+ "Export to CLImage is not supported for this weight configuration");
+ }
+ }
+
+ if (biases != nullptr)
+ {
+ if (is_data_type_quantized_asymmetric(src->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3),
+ "Biases size and number of dst feature maps should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, "Biases should be one dimensional");
+ }
+
+ // Checks performed when dst is configured
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ dst->tensor_shape(), misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ }
+
+ const auto data_type = src->data_type();
+ if (is_data_type_quantized(data_type))
+ {
+ const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
+ const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
+ const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
+
+ float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ }
+ return Status{};
+}
+} // namespace
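validate_arguments() above checks that the requantization multiplier iqinfo.scale * wqinfo.scale / oqinfo.scale can be expressed as a fixed-point multiplier plus a right shift. The simplified standalone sketch below illustrates that decomposition; the library's calculate_quantized_multiplier() handles rounding and edge cases more carefully:

#include <cmath>
#include <cstdint>
#include <cstdio>

int main()
{
    const float src_scale = 0.5f, wei_scale = 0.25f, dst_scale = 1.0f;
    const float multiplier = src_scale * wei_scale / dst_scale; // 0.125

    // Decompose multiplier ~= m * 2^exp with m in [0.5, 1), then store m in Q31
    int   exp = 0;
    float m   = std::frexp(multiplier, &exp); // m = 0.5, exp = -2

    const int32_t fixed_point_multiplier = static_cast<int32_t>(std::lround(m * (1ll << 31)));
    const int     right_shift            = -exp; // applied after the Q31 multiply

    std::printf("multiplier=%f -> fixed_point=%d, right shift=%d\n", multiplier, fixed_point_multiplier, right_shift);
    return 0;
}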
+
+ClDirectConv2dKernel::ClDirectConv2dKernel()
+{
+ _type = CLKernelType::DIRECT;
+}
+
+void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ const DirectConvComputeKernelInfo &desc)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+
+ // Perform validation
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info, desc));
+
+ const int conv_stride_x = std::get<0>(conv_info.stride());
+ const int conv_stride_y = std::get<1>(conv_info.stride());
+
+ _data_layout = src->data_layout();
+ _conv_info = conv_info;
+
+ const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+ const unsigned int kernel_size = weights->dimension(width_idx);
+ const DataType data_type = src->data_type();
+
+ const GPUTarget gpu_target = get_target();
+ unsigned int _num_elems_processed_per_iteration = 0;
+
+ // Get dst shape
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info());
+
+ // Configure kernel window
+ Window win;
+ if (_data_layout == DataLayout::NHWC)
+ {
+ output_shape.collapse(2U, 1U);
+ const unsigned int n0 = adjust_vec_size(desc.n0, output_shape[0]);
+ const unsigned int m0 = adjust_vec_size(desc.m0, output_shape[1]);
+
+ // Create window and update padding
+ win = calculate_max_window(output_shape, Steps(n0, m0));
+ }
+ else if (_data_layout == DataLayout::NCHW)
+ {
+ _num_elems_processed_per_iteration = 1u;
+ win = calculate_max_window(*dst, Steps(_num_elems_processed_per_iteration));
+ }
+
+ ICLKernel::configure_internal(win);
+
+ std::stringstream kernel_name;
+ CLBuildOptions build_options;
+
+ if (_data_layout == DataLayout::NHWC)
+ {
+ kernel_name << "direct_convolution_nhwc";
+
+ const unsigned int n0 = win.x().step();
+ const unsigned int m0 = win.y().step();
+ const unsigned int k0 = adjust_vec_size(desc.k0, src->dimension(channel_idx));
+ const unsigned int partial_store_n0 = dst->dimension(channel_idx) % n0;
+ const unsigned int pad_left = conv_info.pad_left();
+ const unsigned int pad_top = conv_info.pad_top();
+
+ _export_weights_to_cl_image = desc.export_weights_to_cl_image;
+ _export_input_to_cl_image = desc.export_input_to_cl_image;
+ _export_output_to_cl_image = desc.export_output_to_cl_image;
+
+ // Update the padding for the weights tensor if we can export to cl_image
+ if (_export_weights_to_cl_image)
+ {
+ gemm::update_padding_for_cl_image(weights);
+ }
+
+ if (_export_output_to_cl_image)
+ {
+ gemm::update_padding_for_cl_image(dst);
+ }
+
+ if (_export_input_to_cl_image)
+ {
+ gemm::update_padding_for_cl_image(src);
+ }
+
+ if (biases != nullptr)
+ {
+ build_options.add_option(std::string("-DHAS_BIAS"));
+ build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type())));
+ }
+
+ // Conditions of -cl-fast-relaxed-math causing accuracy issues can be traced from COMPMID-5324
+ const auto act_function = act_info.activation();
+ const auto dst_data_type = dst->data_type();
+
+ if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+ (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU ||
+ act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) &&
+ (dst_data_type == DataType::F32 || dst_data_type == DataType::F16))
+ {
+ // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
+ // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
+ build_options.add_option("-cl-unsafe-math-optimizations");
+ }
+ else
+ {
+ build_options.add_option("-cl-fast-relaxed-math");
+ }
+
+ build_options.add_option_if_else(_export_input_to_cl_image, "-DSRC_TENSOR_TYPE=IMAGE",
+ "-DSRC_TENSOR_TYPE=BUFFER");
+ build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+ build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(0)));
+ build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(1)));
+ build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(2)));
+ build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(0)));
+ build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(1)));
+ build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(2)));
+ build_options.add_option_if_else(_export_output_to_cl_image, "-DDST_TENSOR_TYPE=IMAGE",
+ "-DDST_TENSOR_TYPE=BUFFER");
+ build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst_data_type));
+ build_options.add_option_if_else(_export_weights_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE",
+ "-DWEI_TENSOR_TYPE=BUFFER");
+ build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx)));
+ build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx)));
+ build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(weights->data_type()));
+ build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x));
+ build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_stride_y));
+ build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(pad_left));
+ build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(pad_top));
+ build_options.add_option("-DN0=" + support::cpp11::to_string(n0));
+ build_options.add_option("-DM0=" + support::cpp11::to_string(m0));
+ build_options.add_option("-DK0=" + support::cpp11::to_string(k0));
+ build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
+ build_options.add_option_if((src->dimension(channel_idx) % k0) != 0, "-DLEFTOVER_LOOP");
+ build_options.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_function)));
+
+ if (is_data_type_quantized(data_type))
+ {
+ const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
+ const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
+ const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
+
+ PixelValue zero_value = PixelValue(0, src->data_type(), src->quantization_info());
+ int zero_value_s32;
+ zero_value.get(zero_value_s32);
+
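+ // The effective requantization scale (src_scale * weights_scale / dst_scale) is re-expressed as
+ // an integer multiplier plus a right shift, so the kernel can rescale the S32 accumulator to the
+ // destination quantized type with integer-only arithmetic (illustratively, a combined scale of
+ // 0.25 becomes a multiplier of about 2^30 with a shift of 1).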
+ float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+ build_options.add_option("-DIS_QUANTIZED");
+ build_options.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+ build_options.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift));
+ build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
+ build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
+ build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
+ build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32));
+ build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32));
+ }
+ else
+ {
+ build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(0));
+ build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0));
+ build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0));
+ build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0));
+ build_options.add_option_if(act_info.enabled(),
+ "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
+ build_options.add_option_if(act_info.enabled(),
+ "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
+ }
+
+ if (compile_context.get_ddk_version() >= 30)
+ {
+ build_options.add_option("-fregister-allocation=64");
+ }
+ }
+ else
+ {
+ _export_weights_to_cl_image = false;
+
+ kernel_name << "direct_convolution_nchw";
+ build_options.add_option_if(biases != nullptr, std::string("-DHAS_BIAS"));
+ build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(width_idx)));
+ build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(height_idx)));
+ build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(channel_idx)));
+ build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
+ build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
+ build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x));
+ build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_stride_y));
+ build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx)));
+ build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx)));
+ build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
+ build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
+ build_options.add_option(
+ std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx))));
+ build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x)));
+ build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type)));
+ build_options.add_option(
+ std::string("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration)));
+ build_options.add_option(
+ std::string("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)));
+
+ if (is_data_type_quantized(data_type))
+ {
+ const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
+ const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
+ const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
+
+ float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+ build_options.add_option("-DIS_QUANTIZED");
+ build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+ build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
+ build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size));
+ build_options.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
+ build_options.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
+ build_options.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
+ }
+ }
+
+ _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options());
+
+ // Set config_id for enabling LWS tuning
+ // config_id should include the variables used to parameterize the kernel
+ _config_id = kernel_name.str();
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(data_type));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(kernel_size);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(border_size().left);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(border_size().top);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(border_size().right);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(border_size().bottom);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(conv_stride_x);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(conv_stride_y);
+ // SRC_CHANNELS, SRC_WIDTH, SRC_HEIGHT
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src->dimension(channel_idx));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src->dimension(width_idx));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src->dimension(height_idx));
+ _config_id += "_";
+ // DST_CHANNELS, DST_WIDTH, DST_HEIGHT
+ _config_id += support::cpp11::to_string(dst->dimension(channel_idx));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(width_idx));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(height_idx));
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_layout(_data_layout));
+}
+
+Status ClDirectConv2dKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ const DirectConvComputeKernelInfo &desc)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info, desc));
+ return Status{};
+}
+
+void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ // Get initial windows
+ Window slice = window.first_slice_window_3D();
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto weights =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const auto biases =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ if (_data_layout == DataLayout::NHWC)
+ {
+ cl::Image2D weights_cl_image;
+ cl::Image2D output_cl_image;
+ cl::Image2D input_cl_image;
+
+ if (_export_weights_to_cl_image)
+ {
+ // Export tensor to cl_image
+ weights_cl_image = create_image2d_from_tensor(weights, CLImage2DType::ReadOnly);
+ }
+
+ if (_export_output_to_cl_image)
+ {
+ // Export tensor to cl_image
+ output_cl_image = create_image2d_from_tensor(dst, CLImage2DType::WriteOnly);
+ }
+
+ if (_export_input_to_cl_image)
+ {
+ // Export tensor to cl_image
+ input_cl_image = create_image2d_from_tensor(src, CLImage2DType::ReadOnly);
+ }
+
+ unsigned int idx = 0;
+ if (_export_input_to_cl_image)
+ {
+ _kernel.setArg(idx++, input_cl_image);
+ }
+ add_4d_tensor_nhwc_argument(idx, src);
+ if (_export_output_to_cl_image)
+ {
+ _kernel.setArg(idx++, output_cl_image);
+ }
+ add_4d_tensor_nhwc_argument(idx, dst);
+ if (_export_weights_to_cl_image)
+ {
+ _kernel.setArg(idx++, weights_cl_image);
+ }
+ add_4d_tensor_nhwc_argument(idx, weights);
+ if (biases != nullptr)
+ {
+ add_1D_tensor_argument(idx, biases, slice);
+ }
+ enqueue(queue, *this, slice, lws_hint());
+ }
+ else
+ {
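+ // The weights, bias and weights-stride arguments are set only once, starting right after the
+ // argument slots reserved for the src and dst 3D tensors, which are re-set for every slice in
+ // the loop below.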
+ unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
+ add_3D_tensor_argument(idx1, weights, slice);
+
+ if (biases != nullptr)
+ {
+ Window slice_biases;
+ slice_biases.use_tensor_dimensions(biases->info()->tensor_shape());
+ add_1D_tensor_argument(idx1, biases, slice_biases);
+ }
+
+ _kernel.setArg(idx1++, static_cast<unsigned int>(weights->info()->strides_in_bytes()[3]));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src, slice);
+ add_3D_tensor_argument(idx, dst, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (window.slide_window_slice_3D(slice));
+ }
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.h b/src/gpu/cl/kernels/ClDirectConv2dKernel.h
new file mode 100644
index 0000000000..c934c825ca
--- /dev/null
+++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2017-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H
+#define ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+// Forward declaration
+struct DirectConvComputeKernelInfo;
+
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the direct convolution kernel. */
+class ClDirectConv2dKernel : public IClKernel
+{
+public:
+ ClDirectConv2dKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDirectConv2dKernel);
+ /** Set the src, weights, biases and dst tensors info.
+ *
+ * @note: Due to set_valid_region() in NCHW, src/weights/biases cannot be const. Need to change this once the set_valid_region() is removed.
+ *
+ * @note: DirectConvolution only works in the following configurations for the NCHW data layout:
+ * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
+ * 3x3 convolution with stride_x = 1/2, stride_y = 1/2
+ * 5x5 convolution with stride_x = 1/2, stride_y = 1/2
+ * 9x9 convolution with stride_x = 1/2, stride_y = 1/2
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src The src tensor info to convolve. 3 lower dimensions represent a single src [width, height, IFM],
+ * while every optional dimension from 4 and above represents a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights Weights tensor info. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * The 3rd dimension must be the same as the 3rd dimension of @p src.
+ * Data type supported: Same as @p src.
+ * @param[in] biases Biases tensor info. Biases are a 1D tensor with dimension [OFM].
+ * Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[out] dst Output tensor info.
+ * The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p src.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info Contains activation information described in @ref ActivationLayerInfo.
+ * @param[in] desc Direct convolution descriptor used to build the NHWC direct convolution kernel. For NCHW, this parameter is ignored.
+ */
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ const DirectConvComputeKernelInfo &desc);
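+ // A minimal configuration sketch (purely illustrative: shapes, strides and descriptor values
+ // below are hypothetical, and compile_context is a CLCompileContext provided by the caller):
+ //
+ //   ClDirectConv2dKernel kernel;
+ //   TensorInfo src(TensorShape(64U, 32U, 32U, 1U), 1, DataType::F32);  // NHWC: [C, W, H, N]
+ //   TensorInfo wei(TensorShape(64U, 3U, 3U, 128U), 1, DataType::F32);  // NHWC: [IFM, Kx, Ky, OFM]
+ //   TensorInfo bia(TensorShape(128U), 1, DataType::F32);
+ //   TensorInfo dst;                                                    // auto-initialized by configure()
+ //   src.set_data_layout(DataLayout::NHWC);
+ //   wei.set_data_layout(DataLayout::NHWC);
+ //
+ //   DirectConvComputeKernelInfo desc{};
+ //   desc.n0 = 4;
+ //   desc.m0 = 1;
+ //   desc.k0 = 4;
+ //
+ //   kernel.configure(compile_context, &src, &wei, &bia, &dst, PadStrideInfo(1, 1, 1, 1),
+ //                    ActivationLayerInfo(), desc);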
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClDirectConv2dKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ const DirectConvComputeKernelInfo &desc);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+public:
+ DataLayout _data_layout{};
+ PadStrideInfo _conv_info{};
+ bool _export_weights_to_cl_image{false};
+ bool _export_output_to_cl_image{false};
+ bool _export_input_to_cl_image{false};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp
new file mode 100644
index 0000000000..8002520a87
--- /dev/null
+++ b/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClDirectConv3dKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info)
+{
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_LAYOUT(src0, src1, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported");
+
+ // When fusing activation, the same workaround introduced for COMPMID-5324 may be necessary
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv3d_info.act_info.enabled(), "Fused activation not supported");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv3d_info.dilation != Size3D(1U, 1U, 1U));
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->dimension(1) != src0->dimension(0),
+ "Weights feature map dimension should match the respective src's one");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 5, "Weights can be at most 5 dimensional");
+
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(2) >
+ (src0->dimension(1) + conv3d_info.padding.left + conv3d_info.padding.right));
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(3) >
+ (src0->dimension(2) + conv3d_info.padding.top + conv3d_info.padding.bottom));
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(4) >
+ (src0->dimension(3) + conv3d_info.padding.front + conv3d_info.padding.back));
+
+ if (src2 != nullptr)
+ {
+ if (is_data_type_quantized(src0->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0),
+ "Biases size and number of dst feature maps should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->num_dimensions() > 1, "Biases should be one dimensional");
+ }
+
+ // Checks performed when dst is configured
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src1->dimension(0), "Weights and dst OFMs should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ dst->tensor_shape(),
+ misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv3d_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
+ }
+
+ return Status{};
+}
+} // namespace
+
+ClDirectConv3dKernel::ClDirectConv3dKernel()
+{
+ _type = CLKernelType::DIRECT;
+}
+
+void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+
+ // Perform validation
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, conv3d_info));
+
+ // Create window and update padding
+ const DataType data_type = src0->data_type();
+ const size_t src_width = src0->dimension(1);
+ const size_t src_height = src0->dimension(2);
+ const size_t src_depth = src0->dimension(3);
+ const size_t src_channels = src0->dimension(0);
+ const size_t dst_width = dst->dimension(1);
+ const size_t dst_height = dst->dimension(2);
+ const size_t dst_depth = dst->dimension(3);
+ const size_t dst_channels = dst->dimension(0);
+ const size_t weights_width = src1->dimension(2);
+ const size_t weights_height = src1->dimension(3);
+ const size_t weights_depth = src1->dimension(4);
+ const size_t pad_left = conv3d_info.padding.left;
+ const size_t pad_top = conv3d_info.padding.top;
+ const size_t pad_front = conv3d_info.padding.front;
+ const size_t conv_stride_x = conv3d_info.stride.x();
+ const size_t conv_stride_y = conv3d_info.stride.y();
+ const size_t conv_stride_z = conv3d_info.stride.z();
+
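+ // Processing block sizes: each work-item computes n0 output channels for m0 collapsed output
+ // points, accumulating k0 input channels per iteration; partial_store_n0 covers the leftover
+ // when the number of output channels is not a multiple of n0.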
+ const size_t n0 = std::min(dst->dimension(0), static_cast<size_t>(4u));
+ const size_t m0 = (dst->tensor_shape()[0] > 16) ? ((data_type == DataType::F32) ? 2U : 4U) : 1U;
+ const size_t k0 = adjust_vec_size(8u, src0->dimension(0));
+ const size_t partial_store_n0 = dst->dimension(0) % n0;
+
+ CLBuildOptions build_options;
+ build_options.add_option("-cl-fast-relaxed-math");
+ build_options.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width));
+ build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height));
+ build_options.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(src_depth));
+ build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src_channels));
+ build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst_width));
+ build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst_height));
+ build_options.add_option("-DDST_DEPTH=" + support::cpp11::to_string(dst_depth));
+ build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst_channels));
+ build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights_width));
+ build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights_height));
+ build_options.add_option("-DWEI_DEPTH=" + support::cpp11::to_string(weights_depth));
+ build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x));
+ build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_stride_y));
+ build_options.add_option("-DSTRIDE_Z=" + support::cpp11::to_string(conv_stride_z));
+ build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(pad_left));
+ build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(pad_top));
+ build_options.add_option("-DPAD_FRONT=" + support::cpp11::to_string(pad_front));
+ build_options.add_option("-DN0=" + support::cpp11::to_string(n0));
+ build_options.add_option("-DM0=" + support::cpp11::to_string(m0));
+ build_options.add_option("-DK0=" + support::cpp11::to_string(k0));
+ build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
+
+ if (src2 != nullptr)
+ {
+ build_options.add_option(std::string("-DHAS_BIAS"));
+ build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(src2->data_type())));
+ }
+
+ if (is_data_type_quantized(data_type))
+ {
+ const UniformQuantizationInfo iqinfo = src0->quantization_info().uniform();
+ const UniformQuantizationInfo wqinfo = src1->quantization_info().uniform();
+ const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
+
+ PixelValue zero_value = PixelValue(0, src0->data_type(), src0->quantization_info());
+ int zero_value_s32;
+ zero_value.get(zero_value_s32);
+
+ float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+ build_options.add_option("-DIS_QUANTIZED");
+ build_options.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+ build_options.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift));
+ build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
+ build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
+ build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
+ build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32));
+ build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32));
+ }
+ else
+ {
+ build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::F32));
+ build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(0));
+ build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0));
+ build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0));
+ build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0));
+ }
+
+ std::string kernel_name = "direct_convolution3d_ndhwc";
+ _kernel = create_kernel(compile_context, kernel_name, build_options.options());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*dst, Steps(n0, m0));
+ ICLKernel::configure_internal(win);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(data_type));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(weights_width);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(weights_height);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(weights_depth);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(conv_stride_x);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(conv_stride_y);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(conv_stride_z);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst_width);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst_height);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst_channels);
+}
+
+Status ClDirectConv3dKernel::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, conv3d_info));
+ return Status{};
+}
+
+void ClDirectConv3dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto weights =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const auto biases =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ // Get initial windows
+ Window slice = window.first_slice_window_3D();
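+ // The dst W/H/D extents are collapsed into the window's Y dimension (rounded up to the M0 step),
+ // while the batch dimension is mapped onto Z.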
+ slice.set(Window::DimY, Window::Dimension(0,
+ ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2) *
+ dst->info()->dimension(3),
+ slice.y().step()),
+ slice.y().step()));
+ slice.set(Window::DimZ, Window::Dimension(0, dst->info()->dimension(4), 1));
+
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, src, slice);
+ add_4D_tensor_argument(idx, dst, slice);
+ add_4D_tensor_argument(idx, weights, slice);
+ if (biases != nullptr)
+ {
+ add_1D_tensor_argument(idx, biases, slice);
+ }
+ enqueue(queue, *this, slice, lws_hint());
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClDirectConv3dKernel.h b/src/gpu/cl/kernels/ClDirectConv3dKernel.h
new file mode 100644
index 0000000000..cb7509d8fa
--- /dev/null
+++ b/src/gpu/cl/kernels/ClDirectConv3dKernel.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_DIRECT_CONV3D_KERNEL_H
+#define ARM_COMPUTE_CL_DIRECT_CONV3D_KERNEL_H
+
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+class CLCompileContext;
+struct Conv3dInfo;
+
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the direct convolution 3d kernel. */
+class ClDirectConv3dKernel : public IClKernel
+{
+public:
+ /** Constructor */
+ ClDirectConv3dKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ ClDirectConv3dKernel(const ClDirectConv3dKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ ClDirectConv3dKernel &operator=(const ClDirectConv3dKernel &) = delete;
+ /** Default move constructor */
+ ClDirectConv3dKernel(ClDirectConv3dKernel &&) = default;
+ /** Default move assignment operator */
+ ClDirectConv3dKernel &operator=(ClDirectConv3dKernel &&) = default;
+ /** Set the src, weights, biases and dst tensors info.
+ *
+ * Valid data layouts:
+ * - NDHWC
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:--------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src0 Source tensor. 4 lower dimensions represent a single src [IFM, width, height, depth],
+ * while every optional dimension from 5 and above represents a batch of srcs.
+ * @param[in] src1 Weights tensor. Weights are a 5D tensor with dimensions [OFM, IFM, kernel_w, kernel_h, kernel_d].
+ * @param[in] src2 Biases tensor. Shared biases supported. Biases are a 1D tensor with dimensions [OFM].
+ * @param[out] dst Destination tensor. 4 lower dimensions represent a single dst [OFM, width, height, depth], while the rest represent a batch of dsts.
+ * @param[in] conv3d_info Contains strides, padding, rounding, activation, dilation and fast math information. Activation and fast math are currently unused.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info);
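+ // A minimal configuration sketch (purely illustrative: shapes and the Conv3dInfo contents are
+ // hypothetical, and compile_context is a CLCompileContext provided by the caller):
+ //
+ //   ClDirectConv3dKernel kernel;
+ //   TensorInfo src(TensorShape(16U, 8U, 8U, 8U, 1U), 1, DataType::F32);   // NDHWC: [IFM, W, H, D, N]
+ //   TensorInfo wei(TensorShape(32U, 16U, 3U, 3U, 3U), 1, DataType::F32);  // [OFM, IFM, Kw, Kh, Kd]
+ //   TensorInfo bia(TensorShape(32U), 1, DataType::F32);
+ //   TensorInfo dst(TensorShape(32U, 6U, 6U, 6U, 1U), 1, DataType::F32);   // 3x3x3, stride 1, no padding
+ //   src.set_data_layout(DataLayout::NDHWC);
+ //   wei.set_data_layout(DataLayout::NDHWC);
+ //   dst.set_data_layout(DataLayout::NDHWC);
+ //
+ //   Conv3dInfo conv3d_info{};  // assumed defaults: unit strides/dilation, no padding, no activation
+ //   kernel.configure(compile_context, &src, &wei, &bia, &dst, conv3d_info);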
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClDirectConv3dKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DIRECT_CONV3D_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClElementwiseKernel.cpp b/src/gpu/cl/kernels/ClElementwiseKernel.cpp
new file mode 100644
index 0000000000..cdb3527a92
--- /dev/null
+++ b/src/gpu/cl/kernels/ClElementwiseKernel.cpp
@@ -0,0 +1,603 @@
+/*
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/common/utils/Validate.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+#include <map>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+constexpr unsigned int vector_size_byte_opencl = 16;
+
+std::map<ArithmeticOperation, std::string> supported_arithmetic_ops = {
+ {ArithmeticOperation::ADD, "ADD"}, {ArithmeticOperation::SUB, "SUB"},
+ {ArithmeticOperation::DIV, "DIV"}, {ArithmeticOperation::SQUARED_DIFF, "SQUARED_DIFF"},
+ {ArithmeticOperation::MIN, "MIN"}, {ArithmeticOperation::MAX, "MAX"},
+ {ArithmeticOperation::POWER, "POWER"}, {ArithmeticOperation::PRELU, "PRELU"},
+};
+
+std::map<ArithmeticOperation, std::string> supported_sat_arithmetic_ops = {
+ {ArithmeticOperation::ADD, "ADD"},
+ {ArithmeticOperation::SUB, "SUB"},
+};
+
+std::string
+generate_id_for_tuning_common(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst)
+{
+ std::string config_id;
+ // Set config_id for enabling LWS tuning
+ config_id = kernel_name;
+ config_id += "_";
+ config_id += lower_string(string_from_data_type(src1.data_type()));
+ config_id += "_";
+ config_id += support::cpp11::to_string(dst.dimension(0));
+ config_id += "_";
+ config_id += support::cpp11::to_string(dst.dimension(1));
+ return config_id;
+}
+
+Status validate_in_place_output_shape(const bool in_place,
+ const bool src1_in_place,
+ const ITensorInfo &src1,
+ const ITensorInfo &src2,
+ const ITensorInfo &dst,
+ const TensorShape &out_shape)
+{
+ if (in_place)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ detail::have_different_dimensions(out_shape, src1_in_place ? src1.tensor_shape() : src2.tensor_shape(), 0),
+ "Wrong shape for dst, cannot do in_place calculation");
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
+ "Wrong shape for dst");
+ }
+ return Status{};
+}
+
+Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src1,
+ const ITensorInfo &src2,
+ const ITensorInfo &dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(&src1, &src2, &dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &src2);
+
+ // Check whether it is in_place calculation
+ const bool in_place = (&src1 == &dst) || (&src2 == &dst);
+ const bool src1_in_place = in_place && (&src1 == &dst);
+
+ const TensorShape out_shape = TensorShape::broadcast_shape(src1.tensor_shape(), src2.tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+
+ // Validate in case of configured dst
+ if (dst.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape));
+ }
+
+ return Status{};
+}
+
+Status validate_arguments_divide_operation(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::F16, DataType::F32, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
+
+ // Check whether it is in_place calculation
+ const bool in_place = (src1 == dst) || (src2 == dst);
+ const bool src1_in_place = in_place && (src1 == dst);
+
+ const TensorShape out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+
+ // Validate in case of configured dst
+ if (dst->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_in_place_output_shape(in_place, src1_in_place, *src1, *src2, *dst, out_shape));
+ }
+
+ return Status{};
+}
+
+Status
+validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
+ DataType::F16, DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &src2);
+
+ if (is_data_type_quantized_symmetric(src1.data_type()))
+ {
+ const int32_t in1_offset = src1.quantization_info().uniform().offset;
+ const int32_t in2_offset = src2.quantization_info().uniform().offset;
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_offset != 0, "For quantized symmetric, offset must be zero");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(in2_offset != 0, "For quantized symmetric, offset must be zero");
+ }
+
+ // Check whether it is in_place calculation
+ const bool in_place = (&src1 == &dst) || (&src2 == &dst);
+ const bool src1_in_place = in_place && (&src1 == &dst);
+
+ const TensorShape out_shape = TensorShape::broadcast_shape(src1.tensor_shape(), src2.tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+
+ // Validate in case of configured dst
+ if (dst.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
+ "Wrong shape for dst");
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape));
+
+ if (is_data_type_quantized_symmetric(dst.data_type()))
+ {
+ const int32_t offset = dst.quantization_info().uniform().offset;
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(offset != 0, "For quantized symmetric, offset must be zero");
+ }
+ }
+ return Status{};
+}
+
+CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &src1,
+ const ITensorInfo &src2,
+ const ITensorInfo &dst,
+ const std::string &operation_string)
+{
+ CLBuildOptions build_opts;
+
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0));
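+ // Each work-item handles a 16-byte vector of elements (e.g. 4 x F32 or 8 x F16), clamped by
+ // adjust_vec_size() when dimension 0 of the destination is smaller than that.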
+
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1.data_type()));
+ build_opts.add_option("-DVEC_SIZE_IN1=" +
+ support::cpp11::to_string(src1.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_IN2=" +
+ support::cpp11::to_string(src2.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(dst.dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DOP=" + operation_string);
+ if (is_data_type_quantized(src1.data_type()))
+ {
+ const UniformQuantizationInfo iq1info = src1.quantization_info().uniform();
+ const UniformQuantizationInfo iq2info = src2.quantization_info().uniform();
+ const UniformQuantizationInfo oqinfo = dst.quantization_info().uniform();
+
+ build_opts.add_option("-DOFFSET_IN1=" + support::cpp11::to_string(iq1info.offset));
+ build_opts.add_option("-DOFFSET_IN2=" + support::cpp11::to_string(iq2info.offset));
+ build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(oqinfo.offset));
+ build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1info.scale));
+ build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2info.scale));
+ build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale));
+ }
+ build_opts.add_option_if(src1.data_type() == DataType::S32, "-DS32");
+
+ // Check whether it is in_place calculation
+ const bool in_place = (&src1 == &dst) || (&src2 == &dst);
+ const bool src1_in_place = in_place && (&src1 == &dst);
+ build_opts.add_option_if(in_place, "-DIN_PLACE");
+ build_opts.add_option_if(src1_in_place, "-DSRC1_IN_PLACE");
+
+ return build_opts;
+}
+
+std::pair<Status, Window> configure_window_arithmetic_common(ITensorInfo &dst)
+{
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0));
+ Window win = calculate_max_window(dst, Steps(num_elems_processed_per_iteration));
+ return std::make_pair(Status{}, win);
+}
+
+std::pair<Status, Window>
+validate_and_configure_window_for_arithmetic_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
+{
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(src1, src2);
+ const TensorShape &out_shape = broadcast_pair.first;
+
+ auto_init_if_empty(dst, out_shape, 1, src1.data_type());
+
+ return configure_window_arithmetic_common(dst);
+}
+
+std::pair<Status, Window>
+validate_and_configure_window_for_logical_binary_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
+{
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(src1, src2);
+ const TensorShape &out_shape = broadcast_pair.first;
+
+ set_shape_if_empty(dst, out_shape);
+ set_data_type_if_unknown(dst, DataType::U8);
+
+ return configure_window_arithmetic_common(dst);
+}
+
+std::pair<Status, Window>
+validate_and_configure_window_for_division(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
+{
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(src1, src2);
+ const TensorShape &out_shape = broadcast_pair.first;
+
+ auto_init_if_empty(dst, out_shape, 1, src1.data_type());
+
+ return configure_window_arithmetic_common(dst);
+}
+} // namespace
+
+ClElementwiseKernel::ClElementwiseKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClElementwiseKernel::configure_common(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst)
+{
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(*src1, *src2, *dst);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+ std::string kernel_name = "elementwise_operation_" + name();
+ if (is_data_type_quantized(src1->data_type()))
+ {
+ kernel_name += "_quantized";
+ }
+
+ // Set kernel build options
+ CLBuildOptions build_opts = generate_build_options(*src1, *src2, *dst);
+ if (_act_info.enabled())
+ {
+ build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(_act_info.activation())));
+ build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(_act_info.a()));
+ build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(_act_info.b()));
+ }
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ ICLKernel::configure_internal(win_config.second);
+
+ _config_id = generate_id_for_tuning(kernel_name, *src1, *dst);
+}
+
+void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src_0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto src_1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst);
+
+ const TensorShape &in_shape1 = src_0->info()->tensor_shape();
+ const TensorShape &in_shape2 = src_1->info()->tensor_shape();
+ const TensorShape &out_shape = dst->info()->tensor_shape();
+
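+ // Dimensions from Z upwards are collapsed only when both inputs match there, so that broadcasting
+ // (a dimension of size 1 in one input) is still handled per slice further below.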
+ bool can_collapse = true;
+ const bool is_vector = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1;
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
+ {
+ can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+ {
+ can_collapse = (in_shape1[d] == in_shape2[d]);
+ }
+ }
+
+ bool has_collapsed = false;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+
+ const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+ const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+ Window slice = collapsed.first_slice_window_3D();
+ Window slice_src1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+ Window slice_src2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+ // Check whether it is in_place calculation
+ const bool in_place = (src_0 == dst) || (src_1 == dst);
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src_0, slice_src1);
+ add_3D_tensor_argument(idx, src_1, slice_src2);
+ if (!in_place)
+ {
+ add_3D_tensor_argument(idx, dst, slice);
+ }
+
+ enqueue(queue, *this, slice, lws_hint());
+ ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src1));
+ ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src2));
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+
+/** Logical binary */
+
+void ClLogicalBinaryKernel::configure(const ClCompileContext &compile_context,
+ LogicalOperation op,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(ClLogicalBinaryKernel::validate(op, src1, src2, dst));
+ _op = op;
+ configure_common(compile_context, src1, src2, dst);
+}
+
+Status ClLogicalBinaryKernel::validate(LogicalOperation op,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst)
+{
+ ARM_COMPUTE_UNUSED(op);
+ ARM_COMPUTE_ASSERT(op != LogicalOperation::Unknown && op != LogicalOperation::Not);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*src1, *src2, *dst));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window_for_logical_binary_operators(*src1->clone(), *src2->clone(), *dst->clone())
+ .first);
+
+ return Status{};
+}
+
+std::string ClLogicalBinaryKernel::name()
+{
+ switch (_op)
+ {
+ case LogicalOperation::And:
+ return "AND";
+ case LogicalOperation::Or:
+ return "OR";
+ case LogicalOperation::Not:
+ /* fall through */
+ default:
+ ARM_COMPUTE_ASSERT(false); // Unreachable: unsupported logical operation
+ }
+ return "";
+}
+
+std::pair<Status, Window>
+ClLogicalBinaryKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
+{
+ return validate_and_configure_window_for_logical_binary_operators(src1, src2, dst);
+}
+
+CLBuildOptions
+ClLogicalBinaryKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
+{
+ // The arithmetic utility functions can be shared
+ return generate_build_options_with_arithmetic_rules(src1, src2, dst, name());
+}
+
+std::string ClLogicalBinaryKernel::generate_id_for_tuning(const std::string &kernel_name,
+ const ITensorInfo &src1,
+ const ITensorInfo &dst)
+{
+ return generate_id_for_tuning_common(kernel_name, src1, dst);
+}
+
+/** Arithmetic operations with saturation */
+void ClSaturatedArithmeticKernel::configure(const ClCompileContext &compile_context,
+ ArithmeticOperation op,
+ ITensorInfo *input1,
+ ITensorInfo *input2,
+ ITensorInfo *output,
+ const ConvertPolicy &policy,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_ERROR_THROW_ON(ClSaturatedArithmeticKernel::validate(op, input1, input2, output, policy, act_info));
+ auto padding_info = get_padding_info({input1, input2, output});
+
+ _policy = policy;
+ _op = op;
+ _act_info = act_info;
+ configure_common(compile_context, input1, input2, output);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClSaturatedArithmeticKernel::validate(ArithmeticOperation op,
+ const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ConvertPolicy &policy,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_UNUSED(op, policy);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*input1, *input2, *output));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone())
+ .first);
+ ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(output->data_type()));
+
+ return Status{};
+}
+
+std::pair<Status, Window> ClSaturatedArithmeticKernel::validate_and_configure_window(ITensorInfo &input1,
+ ITensorInfo &input2,
+ ITensorInfo &output)
+{
+ return validate_and_configure_window_for_arithmetic_operators(input1, input2, output);
+}
+
+CLBuildOptions ClSaturatedArithmeticKernel::generate_build_options(const ITensorInfo &input1,
+ const ITensorInfo &input2,
+ const ITensorInfo &output)
+{
+ const bool has_float_out = is_data_type_float(output.data_type());
+ auto build_options = generate_build_options_with_arithmetic_rules(input1, input2, output, name());
+ build_options.add_option((_policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE");
+ return build_options;
+}
+
+std::string ClSaturatedArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name,
+ const ITensorInfo &input1,
+ const ITensorInfo &output)
+{
+ auto config_id = generate_id_for_tuning_common(kernel_name, input1, output);
+ config_id += (_policy == ConvertPolicy::WRAP) ? "_wrap_" : "_saturate_";
+ config_id += lower_string(string_from_data_layout(input1.data_layout()));
+ return config_id;
+}
+
+std::string ClSaturatedArithmeticKernel::name()
+{
+ return supported_sat_arithmetic_ops[_op];
+}
+
+/** Arithmetic operations */
+void ClArithmeticKernel::configure(const ClCompileContext &compile_context,
+ ArithmeticOperation op,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(ClArithmeticKernel::validate(op, src1, src2, dst, act_info));
+ auto padding_info = get_padding_info({src1, src2, dst});
+
+ _op = op;
+ _act_info = act_info;
+ configure_common(compile_context, src1, src2, dst);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClArithmeticKernel::validate(ArithmeticOperation op,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst);
+ if (op == ArithmeticOperation::DIV)
+ {
+ // Partial integer support S32/F32/F16
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_divide_operation(src1, src2, dst));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first);
+ }
+ else if (op == ArithmeticOperation::POWER)
+ {
+ // The POWER operator doesn't support integer arithmetic
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_float_only_supported_rules(*src1, *src2, *dst));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*src1, *src2, *dst));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window_for_arithmetic_operators(*src1->clone(), *src2->clone(), *dst->clone())
+ .first);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type()));
+
+ return Status{};
+}
+std::pair<Status, Window>
+ClArithmeticKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
+{
+ if (_op == ArithmeticOperation::DIV || _op == ArithmeticOperation::POWER)
+ {
+ // Division and Power operators don't support integer arithmetic
+ return validate_and_configure_window_for_division(src1, src2, dst);
+ }
+ else
+ {
+ return validate_and_configure_window_for_arithmetic_operators(src1, src2, dst);
+ }
+}
+
+CLBuildOptions
+ClArithmeticKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
+{
+ return generate_build_options_with_arithmetic_rules(src1, src2, dst, name());
+}
+std::string ClArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name,
+ const ITensorInfo &src1,
+ const ITensorInfo &dst)
+{
+ return generate_id_for_tuning_common(kernel_name, src1, dst);
+}
+
+std::string ClArithmeticKernel::name()
+{
+ return supported_arithmetic_ops[_op];
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClElementwiseKernel.h b/src/gpu/cl/kernels/ClElementwiseKernel.h
new file mode 100644
index 0000000000..73e54542b2
--- /dev/null
+++ b/src/gpu/cl/kernels/ClElementwiseKernel.h
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_ELEMENTWISE_KERNEL_H
+#define ARM_COMPUTE_CL_ELEMENTWISE_KERNEL_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/core/KernelTypes.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for an element-wise operation kernel
+ *
+ * Element-wise operation is computed by:
+ * @f[ dst(x,y) = OP(src1(x,y), src2(x,y))@f]
+ *
+ * For binary element-wise ops, in-place computation cannot be enabled by passing nullptr as dst; it is enabled by passing either src1 or src2 as dst instead.
+ *
+ */
+class ClElementwiseKernel : public IClKernel
+{
+public:
+ ClElementwiseKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClElementwiseKernel);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+
+protected:
+ /** The name of the operation */
+ virtual std::string name() = 0;
+
+ /** Validate the tensor shapes and configure the kernel window
+ *
+ * @param[in] src1 First source tensor info. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32.
+ * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
+ * @param[in] dst Destination tensor info. Data types supported: same as @p src1.
+ *
+ * @return a pair of Status and Window
+ */
+ virtual std::pair<Status, Window>
+ validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) = 0;
+
+ /** Generate the build options for the specific kernel
+ *
+ * @return a CLBuildOptions struct
+ */
+ virtual CLBuildOptions
+ generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) = 0;
+
+ /** Generate the identifier for tuning
+ *
+ * @return a string
+ */
+ virtual std::string
+ generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) = 0;
+
+ /** Common configure function for element-wise operators with no additional options (e.g. Div, Min, Max, SquaredDiff)
+ *
+ */
+ void
+ configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst);
+
+ ActivationLayerInfo _act_info{};
+};
+
+class ClLogicalBinaryKernel : public ClElementwiseKernel
+{
+public:
+ ClLogicalBinaryKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClLogicalBinaryKernel);
+ /** Function to configure kernel
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] op Logical binary operation to be executed.
+ * @param[in] src1 First source tensor info. Data types supported: U8.
+ * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
+ * @param[in] dst Destination tensor info. Data types supported: same as @p src1.
+ */
+ void configure(const ClCompileContext &compile_context,
+ LogicalOperation op,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClLogicalBinaryKernel::configure()
+ *
+ * @return a status
+ */
+ static Status
+ validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst);
+
+private:
+ // Inherited methods overridden:
+ std::string name() override;
+ std::pair<Status, Window>
+ validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override;
+ CLBuildOptions
+ generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override;
+ std::string
+ generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override;
+
+ LogicalOperation _op{LogicalOperation::Unknown};
+};
+
+/** Kernel for saturated arithmetic operations (e.g. addition and subtraction) handled with a convert policy */
+class ClSaturatedArithmeticKernel : public ClElementwiseKernel
+{
+public:
+ ClSaturatedArithmeticKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClSaturatedArithmeticKernel);
+ /** Configure kernel for a given list of arguments
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] op Arithmetic operation to be executed.
+ * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
+ * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+ * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+ * @param[in] policy Policy to use to handle overflow.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const ClCompileContext &compile_context,
+ ArithmeticOperation op,
+ ITensorInfo *input1,
+ ITensorInfo *input2,
+ ITensorInfo *output,
+ const ConvertPolicy &policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClSaturatedArithmeticKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(ArithmeticOperation op,
+ const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ConvertPolicy &policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+protected:
+ // Inherited methods overridden:
+ std::string name() override;
+ std::pair<Status, Window>
+ validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override;
+ CLBuildOptions
+ generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override;
+ std::string generate_id_for_tuning(const std::string &kernel_name,
+ const ITensorInfo &input1,
+ const ITensorInfo &output) override;
+
+private:
+ ConvertPolicy _policy{};
+ ArithmeticOperation _op{};
+};
+
+class ClArithmeticKernel : public ClElementwiseKernel
+{
+public:
+ ClArithmeticKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClArithmeticKernel);
+
+ /** Configure kernel for a given list of arguments
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] op Arithmetic operation to be executed.
+ * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
+ * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
+ * @param[in] dst Destination tensor info. Data types supported: same as @p src1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const ClCompileContext &compile_context,
+ ArithmeticOperation op,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClArithmeticKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(ArithmeticOperation op,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+protected:
+ // Inherited methods overridden:
+ std::string name() override;
+ std::pair<Status, Window>
+ validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override;
+ CLBuildOptions
+ generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override;
+ std::string
+ generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override;
+
+private:
+ ArithmeticOperation _op{};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_ELEMENTWISE_KERNEL_H */
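A rough configuration sketch for the saturated arithmetic kernel declared above. The shapes, the example function name and the include set are illustrative assumptions, and run-time dispatch through an ITensorPack and a CL command queue is omitted:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/gpu/cl/kernels/ClElementwiseKernel.h"

using namespace arm_compute;

void example_saturated_add()
{
    // Assumed 32x16 F32 tensors; only the metadata is needed to configure the kernel.
    TensorInfo a_info(TensorShape(32U, 16U), 1, DataType::F32);
    TensorInfo b_info(TensorShape(32U, 16U), 1, DataType::F32);
    TensorInfo out_info(TensorShape(32U, 16U), 1, DataType::F32);

    opencl::kernels::ClSaturatedArithmeticKernel add_kernel;
    const Status status = opencl::kernels::ClSaturatedArithmeticKernel::validate(
        ArithmeticOperation::ADD, &a_info, &b_info, &out_info, ConvertPolicy::SATURATE);
    if (status.error_code() == ErrorCode::OK)
    {
        add_kernel.configure(CLKernelLibrary::get().get_compile_context(), ArithmeticOperation::ADD, &a_info,
                             &b_info, &out_info, ConvertPolicy::SATURATE);
    }
    // At run time the kernel receives an ITensorPack holding the source and destination CL tensors
    // and is dispatched via run_op() on a cl::CommandQueue (not shown here).
}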
diff --git a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp
new file mode 100644
index 0000000000..f7c198ee54
--- /dev/null
+++ b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+constexpr unsigned int vector_size_byte_opencl = 16;
+
+Status validate_arguments(const ITensorInfo &src, const ITensorInfo &dst, const ElementWiseUnary op)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src);
+ if (op == ElementWiseUnary::LOGICAL_NOT)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::U8);
+ }
+ else if (op == ElementWiseUnary::NEG)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32);
+ }
+ else if (op == ElementWiseUnary::RSQRT) // Quantized types are allowed only for RSQRT.
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32);
+ }
+
+ // Validate in case of configured dst
+ if (dst.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
+ }
+
+ return Status{};
+}
+} // namespace
+
+ClElementWiseUnaryKernel::ClElementWiseUnaryKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const ElementWiseUnary &op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ auto padding_info = get_padding_info({src, dst});
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src, *dst, op));
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(vector_size_byte_opencl / dst->element_size(), dst->dimension(0));
+
+ std::string kernel_name = "elementwise_unary";
+ const int vec_size_x = num_elems_processed_per_iteration;
+ const int dst_width_x = dst->dimension(0);
+ if (is_data_type_quantized(src->data_type()))
+ {
+ kernel_name += "_quantized";
+ }
+ // Set kernel build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option("-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(dst_width_x - vec_size_x, 0)));
+ if (is_data_type_quantized(src->data_type()))
+ {
+ const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
+ const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
+ build_opts.add_option("-DOFFSET_IN=" + support::cpp11::to_string(iqinfo.offset));
+ build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(oqinfo.offset));
+ build_opts.add_option("-DSCALE_IN=" + float_to_string_with_full_precision(iqinfo.scale));
+ build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale));
+ }
+ switch (op)
+ {
+ case ElementWiseUnary::RSQRT:
+ build_opts.add_option("-DOPERATION=rsqrt_op");
+ break;
+ case ElementWiseUnary::EXP:
+ build_opts.add_option("-DOPERATION=exp_op");
+ break;
+ case ElementWiseUnary::NEG:
+ build_opts.add_option("-DOPERATION=neg_op");
+ break;
+ case ElementWiseUnary::SIN:
+ build_opts.add_option("-DOPERATION=sin_op");
+ break;
+ case ElementWiseUnary::ABS:
+ build_opts.add_option("-DOPERATION=fabs_op");
+ break;
+ case ElementWiseUnary::LOG:
+ build_opts.add_option("-DOPERATION=natural_log_op");
+ break;
+ case ElementWiseUnary::ROUND:
+ build_opts.add_option("-DOPERATION=round_op");
+ break;
+ case ElementWiseUnary::LOGICAL_NOT:
+ build_opts.add_option("-DOPERATION=logical_not_op");
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*dst);
+ win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+
+ ICLKernel::configure_internal(win);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClElementWiseUnaryKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ElementWiseUnary &op)
+{
+ ARM_COMPUTE_UNUSED(op);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src, *dst, op));
+
+ return Status{};
+}
+
+void ClElementWiseUnaryKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src, slice);
+ add_3D_tensor_argument(idx, dst, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h
new file mode 100644
index 0000000000..81721f8ca8
--- /dev/null
+++ b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_ELEMENTWISE_UNARY_KERNEL_H
+#define ARM_COMPUTE_CL_ELEMENTWISE_UNARY_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the elementwise unary operator */
+class ClElementWiseUnaryKernel : public IClKernel
+{
+public:
+ ClElementWiseUnaryKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClElementWiseUnaryKernel);
+ /** Initialise the kernel's src and dst.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: F16/F32 (U8 for LOGICAL_NOT, S32 also for NEG, QASYMM8/QASYMM8_SIGNED also for RSQRT).
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ * @param[in] op Element wise unary operation to perform.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const ElementWiseUnary &op);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClElementWiseUnaryKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ElementWiseUnary &op);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_ELEMENTWISE_UNARY_KERNEL_H */
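A minimal usage sketch for the unary kernel above, assuming a small F16 tensor and an RSQRT operation; the include paths and example function are assumptions, and run-time dispatch is omitted:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h"

using namespace arm_compute;

void example_rsqrt()
{
    // Assumed 64-element F16 tensors; dst must be initialised since this kernel does not auto-init it.
    TensorInfo src_info(TensorShape(64U), 1, DataType::F16);
    TensorInfo dst_info(TensorShape(64U), 1, DataType::F16);

    opencl::kernels::ClElementWiseUnaryKernel rsqrt_kernel;
    if (opencl::kernels::ClElementWiseUnaryKernel::validate(&src_info, &dst_info, ElementWiseUnary::RSQRT)
            .error_code() == ErrorCode::OK)
    {
        rsqrt_kernel.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info,
                               ElementWiseUnary::RSQRT);
    }
}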
diff --git a/src/gpu/cl/kernels/ClFillKernel.cpp b/src/gpu/cl/kernels/ClFillKernel.cpp
new file mode 100644
index 0000000000..96ad503730
--- /dev/null
+++ b/src/gpu/cl/kernels/ClFillKernel.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClFillKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+ClFillKernel::ClFillKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClFillKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *tensor,
+ const PixelValue &constant_value,
+ Window *window)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+ ARM_COMPUTE_ERROR_THROW_ON(validate(tensor, constant_value, window));
+
+ const DataType data_type = tensor->data_type();
+ const int vec_size_x = 16 / tensor->element_size();
+
+ // Create and update the window (if needed)
+ _full_window = calculate_max_window(*tensor);
+ Window win = _full_window;
+ if (window != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *window);
+ win = *window;
+ }
+
+ const int output_width_x = win.num_iterations(0);
+ const bool multi_access_x = output_width_x >= vec_size_x;
+ const bool remainder_x = output_width_x % vec_size_x > 0;
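+ // Example: output_width_x = 18 with vec_size_x = 4 gives multi_access_x = true and remainder_x = true,
+ // so the window end is rounded up to a multiple of 4 and LAST_ACCESSED_X guards the final access.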
+
+ if (multi_access_x)
+ {
+ win.set(Window::DimX,
+ Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+ }
+ ICLKernel::configure_internal(win);
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(constant_value, data_type));
+ build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option_if(multi_access_x && remainder_x,
+ "-DLAST_ACCESSED_X=" +
+ support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+ _kernel = create_kernel(compile_context, "memset", build_opts.options());
+}
+
+Status ClFillKernel::validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window)
+{
+ ARM_COMPUTE_UNUSED(tensor);
+ ARM_COMPUTE_UNUSED(constant_value);
+ if (window != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(window->x().step() != 1);
+ }
+ return Status{};
+}
+
+void ClFillKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto tensor =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+
+ // Collapse all the batches on the third dimension
+ Window collapsed = window.collapse_if_possible(_full_window, Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, tensor, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClFillKernel.h b/src/gpu/cl/kernels/ClFillKernel.h
new file mode 100644
index 0000000000..5d69fbfbd1
--- /dev/null
+++ b/src/gpu/cl/kernels/ClFillKernel.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_FILL_KERNEL_H
+#define ARM_COMPUTE_CL_FILL_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for filling the planes of a tensor */
+class ClFillKernel : public IClKernel
+{
+public:
+ ClFillKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClFillKernel);
+ /** Initialise the kernel's tensor and filling value
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] tensor Input tensor info. Supported data types: All.
+ * @param[in] constant_value The value used to fill the planes of the tensor
+ * @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr.
+ */
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *tensor,
+ const PixelValue &constant_value,
+ Window *window = nullptr);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClFillKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ Window _full_window{};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_FILL_KERNEL_H */
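A short configuration sketch for the fill kernel above; the shape, fill value and example function are illustrative assumptions:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/gpu/cl/kernels/ClFillKernel.h"

using namespace arm_compute;

void example_fill()
{
    // Assumed 8x8 F32 tensor filled with zeros over its full window.
    TensorInfo tensor_info(TensorShape(8U, 8U), 1, DataType::F32);

    opencl::kernels::ClFillKernel fill_kernel;
    fill_kernel.configure(CLKernelLibrary::get().get_compile_context(), &tensor_info, PixelValue(0.0f));
    // A sub-window with unit step on X could be passed as the optional fourth argument
    // to fill only part of the tensor.
}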
diff --git a/src/gpu/cl/kernels/ClFloorKernel.cpp b/src/gpu/cl/kernels/ClFloorKernel.cpp
new file mode 100644
index 0000000000..358e84012b
--- /dev/null
+++ b/src/gpu/cl/kernels/ClFloorKernel.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClFloorKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+
+ // Validate in case of configured output
+ if (dst->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+ }
+
+ return Status{};
+}
+} // namespace
+
+ClFloorKernel::ClFloorKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClFloorKernel::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ // Auto initialize output
+ auto_init_if_empty(*dst, src->tensor_shape(), 1, src->data_type());
+
+ // Validate
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
+ auto padding_info = get_padding_info({src, dst});
+
+ const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0));
+ const int vec_size_x_leftovers = src->dimension(0) % vec_size_x;
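+ // Example: for a row of 10 elements with an adjusted vector size of 4, VEC_SIZE_LEFTOVER = 2
+ // and the last iteration handles only the two trailing elements.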
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftovers));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, "floor_layer", build_opts.options());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*src, Steps(vec_size_x));
+ IClKernel::configure_internal(win);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClFloorKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
+ return Status{};
+}
+
+void ClFloorKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IClKernel::window(), window);
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src, slice);
+ add_3D_tensor_argument(idx, dst, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClFloorKernel.h b/src/gpu/cl/kernels/ClFloorKernel.h
new file mode 100644
index 0000000000..6e413340ba
--- /dev/null
+++ b/src/gpu/cl/kernels/ClFloorKernel.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_FLOOR_KERNEL_H
+#define ARM_COMPUTE_CL_FLOOR_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to perform a floor operation */
+class ClFloorKernel : public IClKernel
+{
+public:
+ ClFloorKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClFloorKernel);
+ /** Configure kernel for a given list of arguments
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data type supported: F16/F32.
+ * @param[out] dst Destination tensor info. Data type supported: same as @p src
+ */
+ void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClFloorKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_FLOOR_KERNEL_H */
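A minimal sketch showing how the floor kernel above could be validated and configured; the shape and example function are assumptions:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/gpu/cl/kernels/ClFloorKernel.h"

using namespace arm_compute;

void example_floor()
{
    // Assumed 16x4 F32 source; dst is auto-initialised from src inside configure().
    TensorInfo src_info(TensorShape(16U, 4U), 1, DataType::F32);
    TensorInfo dst_info{};

    opencl::kernels::ClFloorKernel floor_kernel;
    if (opencl::kernels::ClFloorKernel::validate(&src_info, &dst_info).error_code() == ErrorCode::OK)
    {
        floor_kernel.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info);
    }
}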
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp
new file mode 100644
index 0000000000..e0d925dfb2
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ if (src0->data_type() == DataType::QASYMM8)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8,
+ DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+ "The number of dimensions for the LHS matrix must be <= 4");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+ "The number of dimensions for the RHS matrix must be <= 3");
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3),
+ "Only 2,3,4,8,16 are supported for k0");
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+ "Only 2,3,4,8,16 are supported for n0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
+
+ const int m = gemm_info.m();
+ const int n = gemm_info.n();
+ const int k = gemm_info.k();
+
+ ARM_COMPUTE_UNUSED(m);
+ ARM_COMPUTE_UNUSED(n);
+ ARM_COMPUTE_UNUSED(k);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k));
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != static_cast<unsigned int>(n));
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != static_cast<unsigned int>(k));
+ if (gemm_info.reinterpret_input_as_3d())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast<unsigned int>(m));
+ }
+
+ if (dst->total_size() != 0)
+ {
+ const TensorInfo tensor_info_dst =
+ dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
+ ITensorInfo *src1,
+ ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info,
+ ElementsProcessed &num_elements_processed)
+{
+ unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+ unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ bool reinterpret_dst_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+
+ Window win{};
+ bool window_changed = false;
+
+ // In case both input and dst have to be reinterpreted as 3D tensors,
+ // force reinterpret_dst_as_3d to be false.
+ if (reinterpret_input_as_3d == reinterpret_dst_as_3d)
+ {
+ reinterpret_dst_as_3d = false;
+ }
+
+ // dst tensor auto initialization if not yet initialized
+ auto_init_if_empty(*dst, src0->clone()
+ ->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))
+ .set_data_type(DataType::S32));
+
+ TensorInfo tmp_info(*dst);
+
+ if (reinterpret_dst_as_3d)
+ {
+ // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+ // the window needs to be constructed on the 2D collapsed version of the tensor
+ TensorShape tmp_shape(dst->tensor_shape());
+ tmp_shape.collapse(2U, 1U);
+ tmp_info.set_tensor_shape(tmp_shape);
+ }
+
+ // Configure kernel window
+ num_elems_processed_per_iteration_x = rhs_info.n0;
+ num_elems_processed_per_iteration_y = lhs_info.m0;
+
+ win =
+ calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ // The RHS matrix still needs padding on the X dimension
+ AccessWindowStatic src1_access(
+ src1, 0, 0, ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), src1->dimension(1));
+
+ window_changed = update_window_and_padding(win, src1_access); // window used by the execute_window_loop
+
+ // Collapse along the Z direction
+ // This collapse needs to be here in order to tune the Z dimension of LWS
+ Window collapsed = win;
+ const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
+ collapsed = win.collapse(win, dimension_to_collapse);
+
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, collapsed);
+}
+} // namespace
+
+ClGemmLowpMatrixMultiplyNativeKernel::ClGemmLowpMatrixMultiplyNativeKernel()
+{
+ _type = CLKernelType::GEMM;
+}
+
+void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ ITensorInfo *src1,
+ ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info));
+
+ _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+ _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
+
+ // We still need padding on the X dimension for the RHS matrix
+ auto padding_info = get_padding_info({src0, dst});
+
+ // In case both input and dst have to be reinterpreted as 3D tensors,
+ // force reinterpret_input_as_3d and reinterpret_dst_as_3d to be false.
+ if (_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+ {
+ _reinterpret_input_as_3d = false;
+ _reinterpret_output_as_3d = false;
+ }
+
+ // Check if we need to slide the matrix B
+ const unsigned int num_dimensions_src0 = src0->num_dimensions();
+ _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0);
+
+ ElementsProcessed num_elements_processed{};
+
+ // Configure kernel window
+ auto win_config =
+ validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
+ // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
+ // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m()
+ const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : dst->dimension(1);
+ // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+ const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
+ const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0;
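+ // Example: internal_m = 33 with lhs_info.m0 = 4 gives partial_store_m0 = 1, while gemm_info.n() = 24
+ // with rhs_info.n0 = 8 gives partial_store_n0 = 0, so only the M boundary needs a partial store.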
+
+ // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
+ // NOTE: This might have implications on heuristics and performance
+ const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
+
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+ build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+ build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+ "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
+ build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+ "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
+ build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
+ build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+ build_opts.add_option("-DM=" + support::cpp11::to_string(src0->dimension(1)));
+ build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
+ build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k()));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
+ build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(src0->data_type()));
+ build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+ build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
+ std::string kernel_name("gemmlowp_mm_native");
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
+ _config_id += "_";
+ _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
+ _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+ _config_id += support::cpp11::to_string(dst->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(gemm_info.k());
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.m0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.n0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.k0);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info)
+{
+ ElementsProcessed num_elements_processed{};
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(),
+ dst->clone().get(), lhs_info, rhs_info, gemm_info,
+ num_elements_processed)
+ .first);
+
+ return Status{};
+}
+
+void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto src1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ if (src1->info()->num_dimensions() < 3)
+ {
+ // The stride_z for matrix B must be zero if we do not slice
+ ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
+ }
+
+ Window slice = window.first_slice_window_3D();
+ Window slice_matrix_b = slice;
+
+ slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+ slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ if (_reinterpret_input_as_3d)
+ {
+ // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
+ const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
+ const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom;
+ _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+ }
+
+ if (_reinterpret_output_as_3d)
+ {
+ // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
+ const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
+ const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
+ _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+ }
+
+ do
+ {
+ Window slice_b = slice;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if (!_slide_matrix_b)
+ {
+ slice_b = slice_matrix_b;
+ }
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, src0, slice);
+ add_2D_tensor_argument(idx, src1, slice_b);
+ add_2D_tensor_argument(idx, dst, slice);
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
+ enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
+ } while (window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h
new file mode 100644
index 0000000000..4f87096158
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to multiply matrices with QASYMM8/QASYMM8_SIGNED data type */
+class ClGemmLowpMatrixMultiplyNativeKernel : public IClKernel
+{
+public:
+ ClGemmLowpMatrixMultiplyNativeKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyNativeKernel);
+ /** Initialise the kernel's inputs and dst.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src0 Source tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
+ * @param[in] src1 Source tensor containing the RHS matrix. Data type supported: same as @p src0
+ * @param[out] dst Destination tensor to store the result of matrix multiplication. Data type supported: S32
+ * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread
+ * lhs_info.m0: 2,3,4,5,6,7,8
+ * lhs_info.k0: 2,3,4,8,16
+ * @param[in] rhs_info RHS matrix information used to retrieve the number of columns to be processed by each thread
+ * rhs_info.n0: 2,3,4,8,16
+ * rhs_info.k0: same as lhs_info.k0
+ * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ ITensorInfo *src1,
+ ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClGemmLowpMatrixMultiplyNativeKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ bool _slide_matrix_b{true};
+ bool _reinterpret_input_as_3d{false};
+ bool _reinterpret_output_as_3d{false};
+ bool _use_dummy_work_items{false};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H */
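A rough sketch of how the quantized native GEMM kernel above might be set up. The problem size, block sizes and include paths are illustrative assumptions only; block sizes must respect the constraints listed in the configure() documentation, and run-time dispatch is omitted:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h"

using namespace arm_compute;

void example_gemmlowp_native()
{
    // Assumed problem size: M = 24, N = 16, K = 32, with both inputs QASYMM8.
    TensorInfo lhs(TensorShape(32U, 24U), 1, DataType::QASYMM8); // K x M
    TensorInfo rhs(TensorShape(16U, 32U), 1, DataType::QASYMM8); // N x K
    TensorInfo dst{};                                            // auto-initialised to S32 by the kernel

    GEMMLHSMatrixInfo lhs_info{};
    lhs_info.m0 = 4;  // rows processed per work-item
    lhs_info.k0 = 16; // accumulation block size, must match rhs_info.k0

    GEMMRHSMatrixInfo rhs_info{};
    rhs_info.n0 = 4;  // columns processed per work-item
    rhs_info.k0 = 16;

    const GEMMReshapeInfo gemm_info(24, 16, 32);

    opencl::kernels::ClGemmLowpMatrixMultiplyNativeKernel mm_kernel;
    if (opencl::kernels::ClGemmLowpMatrixMultiplyNativeKernel::validate(&lhs, &rhs, &dst, lhs_info, rhs_info,
                                                                        gemm_info)
            .error_code() == ErrorCode::OK)
    {
        mm_kernel.configure(CLKernelLibrary::get().get_compile_context(), &lhs, &rhs, &dst, lhs_info, rhs_info,
                            gemm_info);
    }
}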
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp
new file mode 100644
index 0000000000..ddbc809cdd
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+using namespace misc::shape_calculator;
+
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+ "The number of dimensions for the LHS matrix must be <= 4");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+ "The number of dimensions for the RHS matrix must be <= 3");
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose);
+ ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose);
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3),
+ "Only 2,3,4,8,16 are supported for k0");
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+ "Only 2,3,4,8,16 are supported for n0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
+
+ const int m = gemm_info.m();
+ const int n = gemm_info.n();
+ const int k = gemm_info.k();
+
+ TensorShape tensor_shape0{src0->tensor_shape()};
+ tensor_shape0.set(0, k);
+ tensor_shape0.set(1, m);
+
+ TensorShape tensor_shape1{src1->tensor_shape()};
+ tensor_shape1.set(0, n);
+ tensor_shape1.set(1, k);
+
+ const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0);
+ const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
+
+ const TensorInfo tensor_info_reshaped0 =
+ src0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
+ const TensorInfo tensor_info_reshaped1 =
+ src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
+
+ if (dst->total_size() != 0)
+ {
+ const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+ }
+
+ return Status{};
+}
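For intuition, the k0/n0 checks in validate_arguments() above combine the bit trick (x & (x - 1)) == 0, which holds only for powers of two, with an explicit exception for 3 and an upper bound of 16, giving the documented set {2,3,4,8,16}. A minimal standalone sketch of the same predicate (illustrative only, not part of the patch):

#include <cassert>
#include <initializer_list>

// Standalone illustration of the block-size rule enforced above:
// accepted values are 3 or any power of two, within the documented range [2, 16].
static bool is_supported_block_size(unsigned int x)
{
    const bool is_pow2 = (x != 0) && ((x & (x - 1)) == 0); // true for 1, 2, 4, 8, 16, ...
    return (is_pow2 || x == 3) && (x >= 2) && (x <= 16);
}

int main()
{
    for (unsigned int x : {2u, 3u, 4u, 8u, 16u})
    {
        assert(is_supported_block_size(x));
    }
    for (unsigned int x : {0u, 1u, 5u, 6u, 7u, 32u})
    {
        assert(!is_supported_block_size(x));
    }
    return 0;
}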
+
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info,
+ ElementsProcessed &num_elements_processed)
+{
+ unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+ unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+ bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+
+ // dst tensor auto initialization if not yet initialized
+ auto_init_if_empty(
+ *dst, src0->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32));
+
+ TensorInfo tmp_info(*dst);
+ if (reinterpret_output_as_3d)
+ {
+ // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+ // the window needs to be constructed on the 2D collapsed version of the tensor
+ TensorShape tmp_shape(dst->tensor_shape());
+ tmp_shape.collapse(2U, 1U);
+ tmp_info.set_tensor_shape(tmp_shape);
+ }
+
+ // Configure kernel window
+ num_elems_processed_per_iteration_x = rhs_info.n0;
+ num_elems_processed_per_iteration_y = lhs_info.m0;
+ Window win =
+ calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ // Collapse along the Z direction
+ // This collapse needs to be here in order to tune the Z dimension of LWS
+ Window collapsed = win;
+ const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
+ collapsed = win.collapse(win, dimension_to_collapse);
+
+ return std::make_pair(Status{}, collapsed);
+}
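For orientation on the window set up above: each work item covers an n0 x m0 tile of the output, and the batch dimension is collapsed into Z. A small arithmetic sketch with made-up sizes (not taken from the patch):

#include <cstdio>

int main()
{
    // Hypothetical GEMM size and tile shape, chosen only to illustrate the ceil-division.
    const unsigned int m = 100, n = 60, batches = 4; // output is n x m per batch
    const unsigned int m0 = 4, n0 = 8;               // rows/columns handled per work item

    const unsigned int items_x = (n + n0 - 1) / n0;  // ceil(n / n0) = 8
    const unsigned int items_y = (m + m0 - 1) / m0;  // ceil(m / m0) = 25
    // After the Z collapse the launch covers items_x * items_y * batches work items.
    std::printf("launch grid: %u x %u x %u\n", items_x, items_y, batches);
    return 0;
}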
+} // namespace
+
+ClGemmLowpMatrixMultiplyReshapedKernel::ClGemmLowpMatrixMultiplyReshapedKernel()
+{
+ _type = CLKernelType::GEMM;
+}
+
+void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info));
+
+ _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+ _k = gemm_info.k();
+ _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
+
+ // Check if we need to slide the matrix B
+ const unsigned int num_dimensions_src0 = src0->num_dimensions();
+ _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0);
+
+ auto padding_info = get_padding_info({src0, src1, dst});
+ ElementsProcessed num_elements_processed{};
+
+ // Configure kernel window
+ auto win_config =
+ validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+ const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : dst->dimension(1);
+
+ const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
+ const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0;
+
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+ build_opts.add_option_if(_reinterpret_output_as_3d,
+ "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
+ build_opts.add_option_if(_reinterpret_output_as_3d,
+ "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
+ build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
+ build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
+ build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+ build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+ build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m()));
+ build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
+ build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
+ build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
+ build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(src0->data_type()));
+ build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+ build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
+
+ std::string kernel_name("gemmlowp_mm_reshaped_");
+ kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
+ kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt";
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
+ _config_id += "_";
+ _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+ _config_id += support::cpp11::to_string(dst->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(gemm_info.k());
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.m0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.n0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.k0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.v0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.h0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.interleave);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.interleave);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
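The PARTIAL_STORE_M0 / PARTIAL_STORE_N0 options computed in configure() above are simply the leftover rows and columns when M and N are not multiples of the block sizes; the kernel then stores a partial block for the last tile instead of requiring padded output. A small worked example with hypothetical sizes:

#include <cassert>

int main()
{
    // Hypothetical sizes, only to show the arithmetic behind the build options.
    const unsigned int m = 23, n = 50;   // output dimensions
    const unsigned int m0 = 4, n0 = 16;  // block sizes per work item

    const unsigned int partial_store_m0 = m % m0; // 23 % 4  = 3 rows in the last row-block
    const unsigned int partial_store_n0 = n % n0; // 50 % 16 = 2 columns in the last column-block

    assert(partial_store_m0 == 3);
    assert(partial_store_n0 == 2);
    return 0;
}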
+
+Status ClGemmLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info)
+{
+ ElementsProcessed num_elements_processed{};
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(),
+ dst->clone().get(), lhs_info, rhs_info, gemm_info,
+ num_elements_processed)
+ .first);
+
+ return Status{};
+}
+
+void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto src1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ if (src1->info()->num_dimensions() < 3)
+ {
+ // The stride_z for matrix B must be zero if we do not slice
+ ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
+ }
+
+ Window slice = window.first_slice_window_3D();
+ Window slice_matrix_b = slice;
+
+ slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+ slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ if (_reinterpret_output_as_3d)
+ {
+ // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor
+ const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 4;
+ const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
+ _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+ }
+
+ do
+ {
+ Window slice_b = slice;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if (!_slide_matrix_b)
+ {
+ slice_b = slice_matrix_b;
+ }
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, src0, slice);
+ add_2D_tensor_argument(idx, src1, slice_b);
+ add_2D_tensor_argument(idx, dst, slice);
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
+ enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
+ } while (window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h
new file mode 100644
index 0000000000..d7b785996f
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to multiply matrices when both the input matrices LHS (src0) and RHS (src1) have been reshaped
+ *
+ * @note The input matrices @p src0 and @p src1 must be reshaped through:
+ * - @ref opencl::kernels::ClGemmReshapeLhsMatrixKernel
+ * - @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel
+ */
+class ClGemmLowpMatrixMultiplyReshapedKernel : public IClKernel
+{
+public:
+ ClGemmLowpMatrixMultiplyReshapedKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyReshapedKernel);
+ /** Initialise the kernel's input and dst.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src0 Source tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less than or equal to 4.
+ * @param[in] src1 Source tensor containing the RHS reshaped matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less than or equal to 3.
+ * @param[out] dst Destination tensor to store the result of matrix multiplication. Data type supported: S32
+ * @param[in] lhs_info LHS matrix information used for reshaping the src0 tensor. Only the following values are supported:
+ * lhs_info.m0: 2,3,4,5,6,7,8
+ * lhs_info.k0: 2,3,4,8,16
+ * lhs_info.transpose: false
+ * @param[in] rhs_info RHS matrix information used for reshaping the src1 tensor. Only the following values are supported:
+ * rhs_info.n0: 2,3,4,8,16
+ * rhs_info.k0: same as lhs_info.k0
+ * rhs_info.transpose: true
+ * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
+ *
+ * @note lhs_info.k0 must be equal to rhs_info.k0
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClGemmLowpMatrixMultiplyReshapedKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ bool _slide_matrix_b{true};
+ bool _reinterpret_output_as_3d{false};
+ unsigned int _k{1};
+ bool _use_dummy_work_items{false};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp
new file mode 100644
index 0000000000..2f1f3b8df0
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp
@@ -0,0 +1,609 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+#include <tuple>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+using namespace misc::shape_calculator;
+
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ const GEMMKernelInfo &gemm_info,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ if (src0->data_type() == DataType::QASYMM8)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8,
+ DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+ "The number of dimensions for the LHS matrix must be <= 4");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+ "The number of dimensions for the RHS matrix must be <= 3");
+
+ const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info;
+ const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info;
+ const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)),
+ "Only 2,3,4,8,16 are supported for k0");
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16),
+ "Only 2,3,4,8,16 are supported for n0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
+
+ const int m = gemm_info.m;
+ const int n = gemm_info.n;
+ const int k = gemm_info.k;
+
+ TensorShape tensor_shape1{src1->tensor_shape()};
+ tensor_shape1.set(0, n);
+ tensor_shape1.set(1, k);
+
+ const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
+ const TensorInfo tensor_info_reshaped1 =
+ src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+
+ ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k));
+ if (gemm_info.reinterpret_input_as_3d)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast<unsigned int>(m));
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
+
+ const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info);
+ if (dst->total_size() != 0)
+ {
+ const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_dst_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+ if (output_stage.type == GEMMLowpOutputStageType::NONE)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
+ }
+ }
+
+ if (bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != bias->dimension(0));
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) ||
+ (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT),
+ "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported");
+
+ // Checks performed if the dst stage needs to be fused
+ if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ // If a_offset == 0, vector_sum_col can be a nullptr
+ if (gemm_info.a_offset != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_dst_shape[0]);
+ }
+
+ // If b_offset == 0, vector_sum_row can be a nullptr
+ if (gemm_info.b_offset != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+
+ // Check if mm result is a 3D reinterpretation
+ const bool reinterpret_as_3d =
+ expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x();
+
+ // Validate input
+ ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) !=
+ (expected_dst_shape[1] * expected_dst_shape[2]));
+ ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_dst_shape[1]);
+
+ if (expected_dst_shape.num_dimensions() > 1)
+ {
+ const unsigned int dst_batch_idx = reinterpret_as_3d ? 3 : 2;
+
+ TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
+ vector_sum_row_shape.collapse_from(1);
+ TensorShape collapsed_dst_shape(expected_dst_shape);
+ collapsed_dst_shape.collapse_from(dst_batch_idx);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_dst_shape[dst_batch_idx],
+ "vector_sum_row must have the same number of batches of dst tensor");
+
+ if (gemm_info.a_offset != 0)
+ {
+ TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
+ vector_sum_col_shape.collapse_from(1);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 &&
+ vector_sum_col_shape[1] != vector_sum_row_shape[1],
+ "vector_sum_col tensor must have the same number of batches of "
+ "vector_sum_row_shape or the number of batches must be set to 1");
+ }
+ }
+ }
+
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type());
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
+
+ if (output_multipliers != nullptr && output_shifts != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
+ if (output_stage.is_quantized_per_channel)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_shifts->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_multipliers->dimension(0));
+ }
+ }
+ }
+ return Status{};
+}
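The vector_sum_col / vector_sum_row requirements checked above come from the standard offset-contribution decomposition of quantized GEMM: expanding the product of (quantized value minus zero point) terms yields the raw integer product plus a per-output-column term driven by the LHS offset (column sums of B), a per-output-row term driven by the RHS offset (row sums of A), and a constant K * a_offset * b_offset (the K_OFFSET build option set later in configure()). The library applies its own sign conventions; the sketch below only verifies the identity itself:

#include <cassert>
#include <cstdint>

int main()
{
    const int K = 3;
    const int32_t za = 2, zb = 5;                           // zero points of A and B (made up)
    const int32_t A[2][3] = {{10, 11, 12}, {20, 21, 22}};   // 2x3 quantized LHS
    const int32_t B[3][2] = {{1, 2}, {3, 4}, {5, 6}};       // 3x2 quantized RHS

    for (int i = 0; i < 2; ++i)
    {
        for (int j = 0; j < 2; ++j)
        {
            int32_t exact = 0, raw = 0, row_sum_a = 0, col_sum_b = 0;
            for (int k = 0; k < K; ++k)
            {
                exact     += (A[i][k] - za) * (B[k][j] - zb);
                raw       += A[i][k] * B[k][j];
                row_sum_a += A[i][k]; // what vector_sum_row holds (one value per output row)
                col_sum_b += B[k][j]; // what vector_sum_col holds (one value per output column)
            }
            // Raw product corrected by the two sums and the constant K * za * zb.
            const int32_t corrected = raw - zb * row_sum_a - za * col_sum_b + K * za * zb;
            assert(corrected == exact);
        }
    }
    return 0;
}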
+
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ const GEMMKernelInfo &gemm_info,
+ ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ ITensorInfo *bias,
+ ITensorInfo *output_multipliers,
+ ITensorInfo *output_shifts,
+ ElementsProcessed &num_elements_processed)
+{
+ const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
+
+ unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+ unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
+ bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0);
+
+ Window win{};
+ Window win_out{};
+ bool window_changed = false;
+
+ // In case both input and dst have to be reinterpreted as 3D tensors,
+ // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+ if (reinterpret_input_as_3d == reinterpret_output_as_3d)
+ {
+ reinterpret_output_as_3d = false;
+ }
+
+ // dst tensor auto initialization if not yet initialized
+ const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info);
+ if (output_stage.type != GEMMLowpOutputStageType::NONE)
+ {
+ auto_init_if_empty(
+ *dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type));
+ }
+ else
+ {
+ auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(DataType::S32));
+ }
+
+ TensorInfo tmp_info(*dst);
+
+ if (reinterpret_output_as_3d)
+ {
+ // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+ // the window needs to be constructed on the 2D collapsed version of the tensor
+ TensorShape tmp_shape(dst->tensor_shape());
+ tmp_shape.collapse(2U, 1U);
+ tmp_info.set_tensor_shape(tmp_shape);
+ }
+
+ // Configure kernel window
+ num_elems_processed_per_iteration_x = gemm_info.rhs_info.n0;
+ num_elems_processed_per_iteration_y = gemm_info.lhs_info.m0;
+
+ win =
+ calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ win_out =
+ calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ if (gemm_info.a_offset != 0)
+ {
+ AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x);
+ window_changed = window_changed || update_window_and_padding(win_out, vector_sum_col_access);
+ }
+ // No access window needed for vector_sum_row
+ ARM_COMPUTE_UNUSED(vector_sum_row);
+
+ if (bias != nullptr)
+ {
+ AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x);
+ window_changed = window_changed || update_window_and_padding(win_out, bias_access);
+ }
+
+ if (output_multipliers != nullptr && output_stage.is_quantized_per_channel)
+ {
+ AccessWindowHorizontal output_multipliers_access(output_multipliers, 0,
+ num_elems_processed_per_iteration_x);
+ AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x);
+ window_changed =
+ window_changed || update_window_and_padding(win_out, output_multipliers_access, output_shifts_access);
+ }
+ }
+
+ // Collapse along the Z direction
+ // This collapse needs to be here in order to tune the Z dimension of LWS
+ Window collapsed = win;
+ const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
+ collapsed = win.collapse(win, dimension_to_collapse);
+
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, collapsed);
+}
+} // namespace
+
+ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel()
+{
+ _type = CLKernelType::GEMM;
+}
+
+void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ const GEMMKernelInfo &gemm_info,
+ ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ ITensorInfo *bias,
+ ITensorInfo *output_multipliers,
+ ITensorInfo *output_shifts)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias,
+ output_multipliers, output_shifts));
+
+ auto padding_info = get_padding_info({src0, src1, dst, vector_sum_row});
+ const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info;
+ const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info;
+ const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
+ const int32_t a_offset = gemm_info.a_offset;
+ const int32_t b_offset = gemm_info.b_offset;
+
+ _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
+ _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0);
+ _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
+ _is_quantized_per_channel = output_stage.is_quantized_per_channel;
+
+ // In case both input and dst have to be reinterpreted as 3D tensors,
+ // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+ if (_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+ {
+ _reinterpret_input_as_3d = false;
+ _reinterpret_output_as_3d = false;
+ }
+
+ // Check if we need to slide the matrix B
+ const unsigned int num_dimensions_src0 = src0->num_dimensions();
+ _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0);
+
+ ElementsProcessed num_elements_processed{};
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias,
+ output_multipliers, output_shifts, num_elements_processed);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
+ // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
+ // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m
+ const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1);
+
+ // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
+ // NOTE: This might have implications on heuristics and performance
+ const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
+
+ // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+ const unsigned int partial_store_m0 = internal_m % internal_m0;
+ const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
+
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+ build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+ build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+ "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
+ build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+ "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
+ build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
+ build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+ build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+ build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
+ build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
+ build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+ build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+ build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+ build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
+ build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(src0->data_type()));
+
+ std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_");
+ kernel_name += rhs_info.transpose ? "t" : "nt";
+
+ if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ kernel_name += "_fused_output_stage_fixedpoint";
+ _fuse_output_stage = true;
+ // If a_offset == 0, vector_sum_col can be a nullptr
+ if (a_offset != 0 && vector_sum_col != nullptr)
+ {
+ build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
+ build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
+ }
+ // If b_offset == 0, vector_sum_row can be a nullptr
+ build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
+ build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * src0->dimension(0)));
+ build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+ build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
+ // When _is_quantized_per_channel is set, RESULT_MULTIPLIER and RESULT_SHIFT are not used, but they are still passed as part of the T_QUANTIZE8 macro.
+ if (!_is_quantized_per_channel)
+ {
+ build_opts.add_option("-DRESULT_MULTIPLIER=" +
+ support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
+ build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0]));
+ }
+ else
+ {
+ build_opts.add_option("-DRESULT_MULTIPLIER=0");
+ build_opts.add_option("-DRESULT_SHIFT=0");
+ }
+ build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
+
+ const int min = output_stage.gemmlowp_min_bound;
+ const int max = output_stage.gemmlowp_max_bound;
+
+ PixelValue min_val{};
+ PixelValue max_val{};
+ std::tie(min_val, max_val) = get_min_max(dst->data_type());
+ build_opts.add_option_if(min != min_val.get<int32_t>(), "-DMIN_BOUND=" + support::cpp11::to_string(min));
+ build_opts.add_option_if(max != max_val.get<int32_t>(), "-DMAX_BOUND=" + support::cpp11::to_string(max));
+ }
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
+ _config_id += "_";
+ _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
+ _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+ _config_id += support::cpp11::to_string(dst->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(gemm_info.k);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.m0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.n0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.k0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.h0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.interleave);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
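The RESULT_MULTIPLIER / RESULT_SHIFT / RESULT_OFFSET and MIN_BOUND / MAX_BOUND options set above feed the usual gemmlowp-style QUANTIZE_DOWN_FIXEDPOINT stage: the S32 accumulator is scaled by a Q0.31 fixed-point multiplier, shifted with rounding, offset, and clamped. The OpenCL kernel has its own implementation; this host-side sketch only illustrates that arithmetic with hypothetical parameters:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Rounding high multiplication by a Q0.31 fixed-point multiplier (gemmlowp-style);
// the INT32_MIN * INT32_MIN saturation corner case is ignored for brevity.
static int32_t round_doubling_high_mul(int32_t a, int32_t b)
{
    const int64_t ab    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
    return static_cast<int32_t>((ab + nudge) >> 31);
}

// Rounding arithmetic shift right.
static int32_t rounding_shift_right(int32_t x, int shift)
{
    if (shift == 0) return x;
    const int32_t mask      = (1 << shift) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
    return (x >> shift) + ((remainder > threshold) ? 1 : 0);
}

int main()
{
    // Hypothetical requantization parameters, not taken from the patch.
    const int32_t acc               = 12345;       // S32 GEMM accumulator
    const int32_t result_multiplier = 1395864371;  // ~0.65 in Q0.31
    const int     result_shift      = 8;
    const int32_t result_offset     = -128;
    const int32_t min_bound = -128, max_bound = 127; // QASYMM8_SIGNED range

    int32_t v = round_doubling_high_mul(acc, result_multiplier);
    v = rounding_shift_right(v, result_shift);
    v += result_offset;
    v = std::max(min_bound, std::min(max_bound, v));
    std::printf("requantized value: %d\n", v);
    return 0;
}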
+
+Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ const GEMMKernelInfo &gemm_info,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts)
+{
+ ElementsProcessed num_elements_processed{};
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias,
+ output_multipliers, output_shifts));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(src0->clone().get(), src1->clone().get(), dst->clone().get(), gemm_info,
+ vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
+ vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
+ bias != nullptr ? bias->clone().get() : nullptr,
+ output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr,
+ output_shifts != nullptr ? output_shifts->clone().get() : nullptr,
+ num_elements_processed)
+ .first);
+
+ return Status{};
+}
+
+void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto src1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const auto bias =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+ const auto vector_sum_col =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
+ const auto vector_sum_row =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
+ const auto output_shifts =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS));
+ const auto output_multipliers =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ if (src1->info()->num_dimensions() < 3)
+ {
+ // The stride_z for matrix B must be zero if we do not slice
+ ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
+ }
+
+ Window slice = window.first_slice_window_3D();
+ Window slice_matrix_b = slice;
+
+ slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+ slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ if (_reinterpret_input_as_3d)
+ {
+ // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
+ const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
+ const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom;
+ _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+ }
+
+ if (_reinterpret_output_as_3d)
+ {
+ // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor
+ const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
+ const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
+ _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+ }
+
+ // Set window for vector_sum_col
+ Window win_vector_sum_col = slice;
+ win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ // Set window for vector_sum_row
+ Window win_vector_sum_row = slice;
+ win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Window biases_slice = slice;
+ biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+ biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ do
+ {
+ Window slice_b = slice;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if (!_slide_matrix_b)
+ {
+ slice_b = slice_matrix_b;
+ }
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, src0, slice);
+ add_2D_tensor_argument(idx, src1, slice_b);
+ add_2D_tensor_argument(idx, dst, slice);
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
+ if (_reinterpret_input_as_3d)
+ {
+ // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
+ idx++;
+ }
+
+ if (_reinterpret_output_as_3d)
+ {
+ // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor
+ idx++;
+ }
+
+ if (_fuse_output_stage)
+ {
+ add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col);
+ add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row);
+ add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice);
+ add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_multipliers, biases_slice);
+ add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice);
+ }
+ enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
+ } while (window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
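The _slide_matrix_b handling in run_op() above covers the case where the LHS is batched but the RHS is a plain 2D matrix (typical when the GEMM implements a convolution and the reshaped weights are shared across batches): the same RHS slice is then bound for every 3D slice of the LHS/output. A toy host-side sketch of that reuse pattern (names are illustrative, not library types):

#include <cstdio>

// Toy stand-ins for the dimensionality of the LHS and RHS tensors.
struct ToyTensor
{
    unsigned int num_dimensions;
};

int main()
{
    const ToyTensor lhs{3}; // e.g. [K, M, batches]
    const ToyTensor rhs{2}; // e.g. reshaped weights shared by all batches

    const bool slide_matrix_b  = rhs.num_dimensions >= lhs.num_dimensions; // false here
    const unsigned int batches = 4;

    for (unsigned int b = 0; b < batches; ++b)
    {
        // When slide_matrix_b is false the same RHS slice (batch 0) is reused for every batch.
        const unsigned int rhs_slice = slide_matrix_b ? b : 0;
        std::printf("batch %u uses lhs slice %u and rhs slice %u\n", b, b, rhs_slice);
    }
    return 0;
}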
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h
new file mode 100644
index 0000000000..1d4696b089
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to multiply matrices with QASYMM8 data type when only the input matrix RHS (src1) has been reshaped
+ *
+ * @note The input matrix src1 must be reshaped through @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel
+ * @note For fused output stage, only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT type is supported
+ */
+class ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel : public IClKernel
+{
+public:
+ ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel);
+ /** Initialise the kernel's source and destination.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
+ * @param[in] src1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0
+ * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
+ * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
+ * Only the following values are supported for LHS info:
+ * lhs_info.m0: 2,3,4,5,6,7,8
+ * lhs_info.k0: 2,3,4,8,16
+ * Only the following values are supported for RHS info:
+ * rhs_info.n0: 2,3,4,8,16
+ * rhs_info.k0: same as lhs_info.k0
+ * rhs_info.transpose: true
+ * @param[in] vector_sum_col (Optional) Input row-vector of sums of all the entries in each column of matrix B.
+ * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
+ * @param[in] vector_sum_row (Optional) Input row-vector of sums of all the entries in each row of matrix A.
+ * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
+ * @param[in] bias (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: S32.
+ * @param[in] output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
+ * Supported data types: S32.
+ * @param[in] output_shifts (Optional) Output shifts tensor. In case of per-channel quantization, the number of shifts must be equal to the number of filters (OFM).
+ * Supported data types: S32.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ const GEMMKernelInfo &gemm_info,
+ ITensorInfo *vector_sum_col = nullptr,
+ const ITensorInfo *vector_sum_row = nullptr,
+ ITensorInfo *bias = nullptr,
+ ITensorInfo *output_multipliers = nullptr,
+ ITensorInfo *output_shifts = nullptr);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ const GEMMKernelInfo &gemm_info,
+ const ITensorInfo *vector_sum_col = nullptr,
+ const ITensorInfo *vector_sum_row = nullptr,
+ const ITensorInfo *bias = nullptr,
+ const ITensorInfo *output_multipliers = nullptr,
+ const ITensorInfo *output_shifts = nullptr);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ bool _slide_matrix_b{true};
+ bool _reinterpret_input_as_3d{false};
+ bool _reinterpret_output_as_3d{false};
+ bool _use_dummy_work_items{false};
+ bool _is_quantized_per_channel{false};
+ bool _fuse_output_stage{false};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */
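The per-channel output_multipliers and output_shifts documented above are, in the usual GEMMLowp scheme, derived from the real rescale factor of each output channel (input scale times weight scale over output scale) by splitting it into a Q0.31 mantissa and an exponent. The library computes these elsewhere; the sketch below is only for orientation and assumes a factor strictly between 0 and 1:

#include <cassert>
#include <cmath>
#include <cstdint>

// Split a real multiplier in (0, 1) into a Q0.31 fixed-point mantissa and a right shift,
// in the spirit of the common GEMMLowp/TFLite decomposition. Illustrative only.
static void quantize_multiplier_lt_one(double real_multiplier, int32_t &quant_multiplier, int &right_shift)
{
    assert(real_multiplier > 0.0 && real_multiplier < 1.0);
    int exponent = 0;
    const double mantissa = std::frexp(real_multiplier, &exponent); // mantissa in [0.5, 1)
    int64_t q = static_cast<int64_t>(std::llround(mantissa * (1ll << 31)));
    if (q == (1ll << 31)) // rounding pushed the mantissa up to exactly 1.0
    {
        q /= 2;
        ++exponent;
    }
    quant_multiplier = static_cast<int32_t>(q);
    right_shift      = -exponent; // real_multiplier ~= (quant_multiplier / 2^31) / 2^right_shift
}

int main()
{
    int32_t m = 0;
    int     s = 0;
    quantize_multiplier_lt_one(0.0023, m, s); // hypothetical per-channel rescale factor
    // Reconstruct the factor and check the decomposition is close to the original.
    const double back = (static_cast<double>(m) / (1ll << 31)) / std::ldexp(1.0, s);
    assert(std::fabs(back - 0.0023) < 1e-6);
    return 0;
}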
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
new file mode 100644
index 0000000000..030c11d069
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
@@ -0,0 +1,530 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+using namespace misc::shape_calculator;
+
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ const GEMMKernelInfo &gemm_info,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()),
+ "The extension cl_arm_matrix_multiply is not supported on the target platform");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+ "The number of dimensions for the LHS matrix must be <= 4");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+ "The number of dimensions for the RHS matrix must be <= 3");
+
+ const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info;
+ const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info;
+ const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.k0 != 4 || lhs_info.k0 != 4, "Only 4 is supported as value for k0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(lhs_info.m0 == 1 || lhs_info.m0 == 2 || lhs_info.m0 == 4),
+ "Only 1,2,4 are supported for m0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(rhs_info.n0 == 1 || rhs_info.n0 == 4 || rhs_info.n0 == 8),
+ "Only 1,4,8 are supported for n0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
+
+ const int m = gemm_info.m;
+ const int n = gemm_info.n;
+ const int k = gemm_info.k;
+
+ TensorShape tensor_shape1{src1->tensor_shape()};
+ tensor_shape1.set(0, n);
+ tensor_shape1.set(1, k);
+
+ const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
+ const TensorInfo tensor_info_reshaped1 =
+ src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+
+ ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k));
+ if (gemm_info.reinterpret_input_as_3d)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast<unsigned int>(m));
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
+
+ const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info);
+ if (dst->total_size() != 0)
+ {
+ const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_dst_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+ if (output_stage.type == GEMMLowpOutputStageType::NONE)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
+ }
+ }
+
+ if (bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != bias->dimension(0));
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) ||
+ (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT),
+ "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported");
+
+ // Checks performed if the dst stage needs to be fused
+ if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ // If a_offset == 0, vector_sum_col can be a nullptr
+ if (gemm_info.a_offset != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_dst_shape[0]);
+ }
+
+ // If b_offset == 0, vector_sum_row can be a nullptr
+ if (gemm_info.b_offset != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+
+ // Check if mm result is a 3D reinterpretation
+ const bool reinterpret_as_3d =
+ expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x();
+
+ // Validate input
+ ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) !=
+ (expected_dst_shape[1] * expected_dst_shape[2]));
+ ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_dst_shape[1]);
+
+ if (expected_dst_shape.num_dimensions() > 1)
+ {
+ const unsigned int dst_batch_idx = reinterpret_as_3d ? 3 : 2;
+
+ TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
+ vector_sum_row_shape.collapse_from(1);
+ TensorShape collapsed_dst_shape(expected_dst_shape);
+ collapsed_dst_shape.collapse_from(dst_batch_idx);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_dst_shape[dst_batch_idx],
+ "vector_sum_row must have the same number of batches of dst tensor");
+
+ if (gemm_info.a_offset != 0)
+ {
+ TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
+ vector_sum_col_shape.collapse_from(1);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 &&
+ vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                                "vector_sum_col must have the same number of batches as "
+                                                "vector_sum_row, or the number of batches must be 1");
+ }
+ }
+ }
+
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type());
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
+
+ if (output_multipliers != nullptr && output_shifts != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
+ if (output_stage.is_quantized_per_channel)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_shifts->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_multipliers->dimension(0));
+ }
+ }
+ }
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ const GEMMKernelInfo &gemm_info,
+ ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ ITensorInfo *bias,
+ ITensorInfo *output_multipliers,
+ ITensorInfo *output_shifts,
+ ElementsProcessed &num_elements_processed)
+{
+ const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
+
+ unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+ unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+ bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0);
+
+ Window win{};
+ bool window_changed = false;
+
+ constexpr unsigned int mmul_n0 = 4;
+ constexpr unsigned int mmul_m0 = 4;
+ constexpr unsigned int mmul_k0 = 16;
+
+ reinterpret_output_as_3d = false;
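+    // Note: the value computed above is overridden; this kernel builds its window without reinterpreting the output as 3D.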
+ // dst tensor auto initialization if not yet initialized
+ const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info);
+ if (output_stage.type != GEMMLowpOutputStageType::NONE)
+ {
+ auto_init_if_empty(
+ *dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type));
+ }
+ else
+ {
+ auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(DataType::S32));
+ }
+
+ TensorInfo tmp_info(*dst);
+
+ if (reinterpret_output_as_3d)
+ {
+ // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+ // the window needs to be constructed on the 2D collapsed version of the tensor
+ TensorShape tmp_shape(dst->tensor_shape());
+ tmp_shape.collapse(2U, 1U);
+ tmp_info.set_tensor_shape(tmp_shape);
+ }
+
+ // Configure kernel window
+ num_elems_processed_per_iteration_x = 1;
+ num_elems_processed_per_iteration_y = 1;
+
+ win =
+ calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ if (gemm_info.a_offset != 0)
+ {
+ AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x);
+ window_changed = window_changed || update_window_and_padding(win, vector_sum_col_access);
+ }
+ // No access window needed for vector_sum_row
+ ARM_COMPUTE_UNUSED(vector_sum_row);
+
+ if (bias != nullptr)
+ {
+ AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x);
+ window_changed = window_changed || update_window_and_padding(win, bias_access);
+ }
+
+ if (output_multipliers != nullptr && output_stage.is_quantized_per_channel)
+ {
+ AccessWindowHorizontal output_multipliers_access(output_multipliers, 0,
+ num_elems_processed_per_iteration_x);
+ AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x);
+ window_changed =
+ window_changed || update_window_and_padding(win, output_multipliers_access, output_shifts_access);
+ }
+ }
+
+ // Collapse along the Z direction
+ // This collapse needs to be here in order to tune the Z dimension of LWS
+ const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
+ Window collapsed = win.collapse(win, dimension_to_collapse);
+
+    // Reconfigure the window size: one arm_matrix_multiply operation needs 16 threads to finish.
+ Window::Dimension x_dimension = collapsed.x();
+ Window::Dimension y_dimension = collapsed.y();
+
+ // Make M and N multiple of M0 and N0 respectively
+ const unsigned int ceil_to_multiple_n_n0 = ceil_to_multiple(x_dimension.end(), gemm_info.rhs_info.n0);
+ const unsigned int ceil_to_multiple_m_m0 = ceil_to_multiple(y_dimension.end(), gemm_info.lhs_info.m0);
+
+ // Divide M and N by M0 and N0 respectively
+ const unsigned int n_div_n0 = ceil_to_multiple_n_n0 / gemm_info.rhs_info.n0;
+ const unsigned int m_div_m0 = ceil_to_multiple_m_m0 / gemm_info.lhs_info.m0;
+
+ // Make n_div_n0 and m_div_m0 multiple of mmul_n0 and mmul_k0 respectively
+ const unsigned int ceil_to_multiple_n_div_n0_mmul_n0 = ceil_to_multiple(n_div_n0, mmul_n0);
+ const unsigned int ceil_to_multiple_m_div_m0_mmul_m0 = ceil_to_multiple(m_div_m0, mmul_k0);
+
+ // Ensure x_dimension is multiple of MMUL block size (mmul_n0 * mmul_m0)
+ x_dimension.set_end(ceil_to_multiple_n_div_n0_mmul_n0 * mmul_n0);
+ y_dimension.set_end(ceil_to_multiple_m_div_m0_mmul_m0 / mmul_m0);
+
+ collapsed.set(Window::DimX, x_dimension);
+ collapsed.set(Window::DimY, y_dimension);
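+
+    // Illustrative example (values chosen purely for exposition): with n = 64, m = 64, rhs_info.n0 = 4,
+    // lhs_info.m0 = 4, mmul_n0 = 4, mmul_m0 = 4 and mmul_k0 = 16, the collapsed window ends become:
+    //   x end: ceil_to_multiple(64 / 4, 4) * 4 = 64
+    //   y end: ceil_to_multiple(64 / 4, 16) / 4 = 4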
+
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, collapsed);
+}
+} // namespace
+
+ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel()
+{
+ _type = CLKernelType::GEMM;
+}
+
+void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ const GEMMKernelInfo &gemm_info,
+ ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ ITensorInfo *bias,
+ ITensorInfo *output_multipliers,
+ ITensorInfo *output_shifts)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias,
+ output_multipliers, output_shifts));
+
+ auto padding_info = get_padding_info({src0, src1, dst, vector_sum_row});
+ const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info;
+ const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info;
+ const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
+ const int32_t a_offset = gemm_info.a_offset;
+ const int32_t b_offset = gemm_info.b_offset;
+ constexpr int mmul_m0 = 4;
+ constexpr int mmul_n0 = 4;
+ constexpr int mmul_k0 = 16;
+
+ _m = gemm_info.m;
+ _n = gemm_info.n;
+ _k = gemm_info.k;
+
+ ElementsProcessed num_elements_processed{};
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias,
+ output_multipliers, output_shifts, num_elements_processed);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ const unsigned int m0_leftover = _m % lhs_info.m0;
+ const unsigned int n0_leftover = _n % rhs_info.n0;
+
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
+ build_opts.add_option("-DVEC_TYPE=" + get_cl_type_from_data_type(src0->data_type()) + "4");
+ build_opts.add_option("-DACC_DATA_TYPE=int");
+ build_opts.add_option("-DOUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+ build_opts.add_option("-DM0_LEFTOVER=" + support::cpp11::to_string(m0_leftover));
+ build_opts.add_option("-DN0_LEFTOVER=" + support::cpp11::to_string(n0_leftover));
+ build_opts.add_option("-DMMUL_M0=" + support::cpp11::to_string(mmul_m0));
+ build_opts.add_option("-DMMUL_N0=" + support::cpp11::to_string(mmul_n0));
+ build_opts.add_option("-DMMUL_K0=" + support::cpp11::to_string(mmul_k0));
+ build_opts.add_option("-DACTIVATION_TYPE=" +
+ lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+ build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+ build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
+
+ std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_mmul");
+
+ if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ build_opts.add_option("-DFUSED_OUTPUT_STAGE_FIXED_POINT");
+ _fuse_output_stage = true;
+ // If a_offset == 0, vector_sum_col can be a nullptr
+ if (a_offset != 0 && vector_sum_col != nullptr)
+ {
+ build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
+ build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
+ }
+ // If b_offset == 0, vector_sum_row can be a nullptr
+ build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
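+        // K_OFFSET folds the constant a_offset * b_offset * k term of the offset contribution into a compile-time constant.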
+ build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * src0->dimension(0)));
+ build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+        build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
+ build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
+ build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
+ build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0]));
+
+ const int min = output_stage.gemmlowp_min_bound;
+ const int max = output_stage.gemmlowp_max_bound;
+
+ PixelValue min_val{};
+ PixelValue max_val{};
+ std::tie(min_val, max_val) = get_min_max(dst->data_type());
+ build_opts.add_option_if(min != min_val.get<int32_t>(), "-DMIN_BOUND=" + support::cpp11::to_string(min));
+ build_opts.add_option_if(max != max_val.get<int32_t>(), "-DMAX_BOUND=" + support::cpp11::to_string(max));
+ }
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += (bias != nullptr ? "add_bias_" : "");
+ _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : "");
+ _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : "");
+ _config_id += lower_string(string_from_data_type(src0->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_m);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_n);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_k);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.m0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.n0);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ const GEMMKernelInfo &gemm_info,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts)
+{
+ ElementsProcessed num_elements_processed{};
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias,
+ output_multipliers, output_shifts));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(src0->clone().get(), src1->clone().get(), dst->clone().get(), gemm_info,
+ vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
+ vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
+ bias != nullptr ? bias->clone().get() : nullptr,
+ output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr,
+ output_shifts != nullptr ? output_shifts->clone().get() : nullptr,
+ num_elements_processed)
+ .first);
+
+ return Status{};
+}
+
+void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto src1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const auto src2 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+ const auto vector_sum_col =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
+ const auto vector_sum_row =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+
+ if (src1->info()->num_dimensions() < 3)
+ {
+ // The stride_z for matrix B must be zero if we do not slice
+ ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
+ }
+
+ cl::Image2D src1_image2d;
+
+ Window slice = window.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+
+ add_3d_tensor_nhw_argument(idx, src0);
+ add_3d_tensor_nhw_argument(idx, src1);
+
+        // Bias buffer (only added when a bias tensor is provided)
+ if (src2 != nullptr)
+ {
+ add_3d_tensor_nhw_argument(idx, src2);
+ }
+ // dst buffer
+ add_3d_tensor_nhw_argument(idx, dst);
+
+        // Pass m, n and k at runtime as signed ints, so that any subtraction in which they appear as an operand still yields a signed result.
+ _kernel.setArg<cl_int>(idx++, _m);
+ _kernel.setArg<cl_int>(idx++, _n);
+ _kernel.setArg<cl_int>(idx++, _k);
+
+ if (_fuse_output_stage)
+ {
+ if (vector_sum_col != nullptr)
+ {
+ add_3d_tensor_nhw_argument(idx, vector_sum_col);
+ }
+ if (vector_sum_row != nullptr)
+ {
+ add_3d_tensor_nhw_argument(idx, vector_sum_row);
+ }
+ }
+
+ enqueue(queue, *this, slice, cl::NDRange(32, 2), false);
+ } while (window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h
new file mode 100644
index 0000000000..fc8b73140d
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel that uses the MMUL (arm_matrix_multiply) instruction to multiply matrices with QASYMM8/QASYMM8_SIGNED data types, when only the RHS input matrix (src1) has been reshaped
+ *
+ * @note The input matrix src1 must be reshaped through @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel
+ * @note For fused output stage, only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT type is supported
+ */
+class ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel : public IClKernel
+{
+public:
+ ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel);
+ /** Initialise the kernel's source and destination.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
+ * @param[in] src1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0
+ * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
+ * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
+ * lhs_info.m0: 1,2,4
+ * Only the following values are supported for RHS info:
+ * rhs_info.n0: 1,4,8
+ * rhs_info.k0: same as lhs_info.k0: 4
+ * rhs_info.transpose: true
+ * @param[in] vector_sum_col (Optional) Input row-vector of sums of all the entries in each column of matrix B.
+ * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
+ * @param[in] vector_sum_row (Optional) Input row-vector of sums of all the entries in each row of matrix A.
+ * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
+ * @param[in] bias (Optional) Biases tensor. Can be a nullptr if the addition of biases is not required.
+ * Biases are 1D tensor with dimensions [OFM] or same dimensionality as dst if gemm_info.broadcast_bias is false. Data type supported: S32.
+ * @param[in] output_multipliers (Optional) Output multipliers tensor. Supported data types: S32.
+ * @param[in] output_shifts (Optional) Output shifts tensor. Supported data types: S32.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ const GEMMKernelInfo &gemm_info,
+ ITensorInfo *vector_sum_col = nullptr,
+ const ITensorInfo *vector_sum_row = nullptr,
+ ITensorInfo *bias = nullptr,
+ ITensorInfo *output_multipliers = nullptr,
+ ITensorInfo *output_shifts = nullptr);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ const GEMMKernelInfo &gemm_info,
+ const ITensorInfo *vector_sum_col = nullptr,
+ const ITensorInfo *vector_sum_row = nullptr,
+ const ITensorInfo *bias = nullptr,
+ const ITensorInfo *output_multipliers = nullptr,
+ const ITensorInfo *output_shifts = nullptr);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ bool _fuse_output_stage{false};
+ signed int _m{1};
+ signed int _n{1};
+ signed int _k{1};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp
new file mode 100644
index 0000000000..d93dbde95a
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2017-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ int32_t a_offset,
+ int32_t b_offset)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
+
+ if (bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
+ }
+
+ // If a_offset == 0, vector_sum_col can be a nullptr
+ if (a_offset != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
+ }
+
+ // If b_offset == 0, vector_sum_row can be a nullptr
+ if (b_offset != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+
+ // Check if input is a 3D reinterpretation
+ const bool reinterpret_as_3d =
+ mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+
+ // Validate input
+ ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) !=
+ (mm_result->dimension(1) * mm_result->dimension(2)));
+ ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
+
+ TensorShape output_shape = mm_result->tensor_shape();
+ if (output_shape.num_dimensions() > 1)
+ {
+ const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
+
+ TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
+ vector_sum_row_shape.collapse_from(1);
+ output_shape.collapse_from(output_batch_idx);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
+                                            "vector_sum_row must have the same number of batches as the output tensor");
+
+ if (a_offset != 0)
+ {
+ TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
+ vector_sum_col_shape.collapse_from(1);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 &&
+ vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                                "vector_sum_col must have the same number of batches as "
+                                                "vector_sum_row, or the number of batches must be 1");
+ }
+ }
+ }
+
+ return Status{};
+}
+} // namespace
+
+ClGemmLowpOffsetContributionKernel::ClGemmLowpOffsetContributionKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClGemmLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ int32_t k,
+ int32_t a_offset,
+ int32_t b_offset)
+{
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset));
+
+ auto padding_info = get_padding_info({mm_result, vector_sum_col, vector_sum_row, bias});
+
+ // Check if input is a 3D reinterpretation
+ const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->num_dimensions() > 1 &&
+ mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+
+ const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->dimension(0));
+
+ // Set the arguments to pass at compile time
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration));
+
+ // If a_offset == 0, vector_sum_col can be a nullptr
+ if (a_offset != 0)
+ {
+ build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
+ build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
+ }
+ // If b_offset == 0, vector_sum_row can be a nullptr
+ build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
+ build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
+ build_opts.add_option_if(reinterpret_as_3d,
+ "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1)));
+ build_opts.add_option_if(reinterpret_as_3d,
+ "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2)));
+ build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+
+ std::string kernel_name("gemmlowp_offset_contribution");
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration));
+ IClKernel::configure_internal(win);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name + "_";
+ _config_id += support::cpp11::to_string(mm_result->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(mm_result->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(mm_result->dimension(2));
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ int32_t a_offset,
+ int32_t b_offset)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset));
+ return Status{};
+}
+
+void ClGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window);
+
+ const auto vector_sum_col =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
+ const auto vector_sum_row =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
+ const auto bias =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+ const auto mm_result = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_SRC_DST));
+
+ Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ // Set window for vector_sum_col
+ Window win_vector_sum_col = slice;
+ win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ // Set window for vector_sum_row
+ Window win_vector_sum_row = slice;
+ win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Window biases_slice = slice;
+ biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+ biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, mm_result, slice);
+ add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col);
+ add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row);
+ add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice);
+
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h
new file mode 100644
index 0000000000..2080a3a091
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel used to add the offset contribution after the matrix multiplication. The computation is performed in-place.
+ *
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication),
+ * and adds to it the offset contribution of matrix A and matrix B in-place.
+ *
+ * The final result is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ * (vector_sum_col[k] * a_offset) +
+ * (vector_sum_row[i] * b_offset) +
+ * (a_offset * b_offset * k)
+ *
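+ * For example (illustrative values only): with a_offset = 2, b_offset = 3, k = 4,
+ * vector_sum_col[k] = 10 and vector_sum_row[i] = 20, the value added to mm_result[i][k]
+ * is (10 * 2) + (20 * 3) + (2 * 3 * 4) = 104.
+ *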
+ */
+class ClGemmLowpOffsetContributionKernel : public IClKernel
+{
+public:
+ ClGemmLowpOffsetContributionKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpOffsetContributionKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
+ * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
+ * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+ * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
+ * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+     *                        Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
+ * @param[in] k Number of matrix A columns or Matrix B rows
+ * @param[in] a_offset Offset to be added to each element of the matrix A.
+ * @param[in] b_offset Offset to be added to each element of the matrix B.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ int32_t k,
+ int32_t a_offset,
+ int32_t b_offset);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClGemmLowpOffsetContributionKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ int32_t a_offset,
+ int32_t b_offset);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp
new file mode 100644
index 0000000000..26f479f61a
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ int32_t a_offset,
+ int32_t b_offset,
+ const GEMMLowpOutputStageInfo &output_stage,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
+
+ if (bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
+ if (output_stage.is_quantized_per_channel)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_shifts->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_multipliers->dimension(0));
+ }
+
+ // If a_offset == 0, vector_sum_col can be a nullptr
+ if (a_offset != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
+ }
+
+ // If b_offset == 0, vector_sum_row can be a nullptr
+ if (b_offset != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+
+ // Check if input is a 3D reinterpretation
+ const bool reinterpret_as_3d =
+ mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+
+ // Validate input
+ ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) !=
+ (mm_result->dimension(1) * mm_result->dimension(2)));
+ ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
+
+ TensorShape output_shape = mm_result->tensor_shape();
+ if (output_shape.num_dimensions() > 1)
+ {
+ const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
+
+ TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
+ vector_sum_row_shape.collapse_from(1);
+ output_shape.collapse_from(output_batch_idx);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
+                                            "vector_sum_row must have the same number of batches as the output tensor");
+
+ if (a_offset != 0)
+ {
+ TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
+ vector_sum_col_shape.collapse_from(1);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 &&
+ vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                                "vector_sum_col must have the same number of batches as "
+                                                "vector_sum_row, or the number of batches must be 1");
+ }
+ }
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE);
+ // Checks performed when output is configured
+ if ((dst != nullptr) && (dst->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type());
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, dst);
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(),
+ "per channel quantization info is incorrect");
+
+ return Status{};
+}
+} // namespace
+
+ClGemmLowpOffsetContributionOutputStageKernel::ClGemmLowpOffsetContributionOutputStageKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ int32_t k,
+ int32_t a_offset,
+ int32_t b_offset,
+ const GEMMLowpOutputStageInfo &output_stage,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts)
+{
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst, output_multipliers, output_shifts);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset,
+ b_offset, output_stage, output_multipliers, output_shifts));
+
+ auto padding_info =
+ get_padding_info({mm_result, vector_sum_col, vector_sum_row, bias, dst, output_multipliers, output_shifts});
+
+ const int min = output_stage.gemmlowp_min_bound;
+ const int max = output_stage.gemmlowp_max_bound;
+
+ _is_quantized_per_channel = output_stage.is_quantized_per_channel;
+
+ // Check if input is a 3D reinterpretation
+ const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->num_dimensions() > 1 &&
+ mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+
+ // Auto initialize the output
+ auto_init_if_empty(*dst, mm_result->clone()->set_data_type(output_stage.output_data_type));
+
+ const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->dimension(0));
+
+ // Set the arguments to pass at compile time
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration));
+
+ // If a_offset == 0, vector_sum_col can be a nullptr
+ if (a_offset != 0)
+ {
+ build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
+ build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
+ }
+ // If b_offset == 0, vector_sum_row can be a nullptr
+ build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
+ build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
+ build_opts.add_option_if(reinterpret_as_3d,
+ "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1)));
+ build_opts.add_option_if(reinterpret_as_3d,
+ "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2)));
+ build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+ build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
+ build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
+ build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0]));
+ build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
+ build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
+
+ PixelValue min_val{};
+ PixelValue max_val{};
+ std::tie(min_val, max_val) = get_min_max(dst->data_type());
+ build_opts.add_option_if((min > min_val.get<int32_t>()), "-DMIN_BOUND=" + support::cpp11::to_string(min));
+ build_opts.add_option_if((max < max_val.get<int32_t>()), "-DMAX_BOUND=" + support::cpp11::to_string(max));
+
+ std::string kernel_name("gemmlowp_offset_contribution");
+ kernel_name += "_" + string_from_gemmlowp_output_stage(output_stage.type);
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration));
+ ICLKernel::configure_internal(win);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name + "_";
+ _config_id += support::cpp11::to_string(mm_result->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(mm_result->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(mm_result->dimension(2));
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ int32_t a_offset,
+ int32_t b_offset,
+ const GEMMLowpOutputStageInfo &output_stage,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset,
+ b_offset, output_stage, output_multipliers, output_shifts));
+ return Status{};
+}
+
+void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto mm_result =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ const auto bias =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+ const auto vector_sum_col =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
+ const auto vector_sum_row =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
+ const auto output_shifts =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS));
+ const auto output_multipliers =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ // Set window for vector_sum_col
+ Window win_vector_sum_col = slice;
+ win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ // Set window for vector_sum_row
+ Window win_vector_sum_row = slice;
+ win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Window biases_slice = slice;
+ biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+ biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, mm_result, slice);
+ add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col);
+ add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row);
+ add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice);
+ add_3D_tensor_argument(idx, dst, slice);
+ add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_multipliers, biases_slice);
+ add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h
new file mode 100644
index 0000000000..97ee9bc97f
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_OUTPUT_STAGE_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_OUTPUT_STAGE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel used to add the offset contribution after the matrix multiplication and perform the output stage.
+ *
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), adds to it the offset contribution
+ * of matrix A and matrix B and performs the output stage defined by the output_stage argument
+ *
+ * @note For quantized computations the output data type for auto-initialization must be passed as part of the @ref GEMMLowpOutputStageInfo.
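+ *
+ * As a rough sketch (not the exact kernel arithmetic): after the offset contribution is added, the int32 value is
+ * rescaled with the output stage's multiplier and shift, the result offset is added, the value is clamped to the
+ * requested min/max bounds and then stored in the quantized destination data type.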
+ */
+class ClGemmLowpOffsetContributionOutputStageKernel : public IClKernel
+{
+public:
+ ClGemmLowpOffsetContributionOutputStageKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpOffsetContributionOutputStageKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
+ * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
+ * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+ * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
+ * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
+ * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED.
+ * @param[in] k Number of matrix A columns or Matrix B rows
+ * @param[in] a_offset Offset to be added to each element of the matrix A.
+ * @param[in] b_offset Offset to be added to each element of the matrix B.
+ * @param[in] output_stage GEMMLowp output stage info
+ * @param[in] output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
+ * Supported data types: S32
+ * @param[in] output_shifts Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
+ * Supported data types: S32
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ int32_t k,
+ int32_t a_offset,
+ int32_t b_offset,
+ const GEMMLowpOutputStageInfo &output_stage,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClGemmLowpOffsetContributionOutputStageKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ int32_t a_offset,
+ int32_t b_offset,
+ const GEMMLowpOutputStageInfo &output_stage,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ bool _is_quantized_per_channel{false};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_OUTPUT_STAGE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
new file mode 100644
index 0000000000..7b7beab12c
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32);
+
+ // Check biases if exist
+ if (bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
+ }
+
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching dst data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+ }
+
+ return Status{};
+}
+} // namespace
+
+ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info));
+
+ return Status{};
+}
+
+void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *info)
+{
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info));
+
+ auto padding_info = get_padding_info({src, bias, dst});
+
+ // dst auto initialization if not yet initialized
+ auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type));
+
+ const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0));
+
+ // Set the arguments to pass at compile time
+ auto min = info->gemmlowp_min_bound;
+ auto max = info->gemmlowp_max_bound;
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(info->gemmlowp_offset));
+ build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(info->gemmlowp_multiplier));
+ build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(info->gemmlowp_shift));
+ build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
+ build_opts.add_option_if(
+ (min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) &&
+ (min != max),
+ "-DMIN_BOUND=" + support::cpp11::to_string(min));
+ build_opts.add_option_if(
+ (max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) &&
+ (min != max),
+ "-DMAX_BOUND=" + support::cpp11::to_string(max));
+ build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+
+ // Create kernel
+ const std::string kernel_name = (info->output_data_type == DataType::QSYMM16)
+ ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16"
+ : "gemmlowp_output_stage_quantize_down_fixedpoint";
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Configure kernel window
+ auto win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
+ ICLKernel::configure_internal(win);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ const auto bias =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ // Create src window
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ // Setup bias slice
+ unsigned int idx1 = num_arguments_per_3D_tensor();
+ if (bias != nullptr)
+ {
+ Window biases_slice(slice);
+ biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+ biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+ add_1D_tensor_argument(idx1, bias, biases_slice);
+ }
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src, slice);
+ add_3D_tensor_argument(idx1, dst, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
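
Note (not part of the patch): the -D options built above encode a gemmlowp-style fixed-point requantization of each S32 accumulator. A rough scalar reference, for illustration only, follows; the helper names are invented, a non-negative result_shift is assumed, and the device kernel remains the authoritative definition.

#include <algorithm>
#include <cstdint>
#include <limits>

// Hypothetical helper: gemmlowp-style saturating rounding doubling high multiply.
int32_t saturating_rounding_doubling_high_mul(int32_t a, int32_t b)
{
    const int64_t ab    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int64_t nudge = (ab >= 0) ? (1ll << 30) : (1 - (1ll << 30));
    const int64_t high  = (ab + nudge) / (1ll << 31);
    return static_cast<int32_t>(std::min<int64_t>(std::max<int64_t>(high, std::numeric_limits<int32_t>::min()),
                                                  std::numeric_limits<int32_t>::max()));
}

// Hypothetical helper: rounding division by 2^exponent (exponent >= 0 assumed).
int32_t rounding_divide_by_pow2(int32_t x, int32_t exponent)
{
    const int32_t mask      = (1 << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
    return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}

// One S32 accumulator -> one quantized value, mirroring the compile-time options above.
int32_t quantize_down_fixedpoint(int32_t acc, int32_t bias, int32_t multiplier, int32_t shift,
                                 int32_t offset_after_shift, int32_t min_bound, int32_t max_bound)
{
    acc += bias;                                                  // -DADD_BIAS
    acc = saturating_rounding_doubling_high_mul(acc, multiplier); // -DRESULT_FIXEDPOINT_MULTIPLIER
    acc = rounding_divide_by_pow2(acc, shift);                    // -DRESULT_SHIFT (negative shifts not modelled)
    acc += offset_after_shift;                                    // -DRESULT_OFFSET_AFTER_SHIFT
    return std::min(std::max(acc, min_bound), max_bound);         // -DMIN_BOUND / -DMAX_BOUND, then cast to dst type
}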
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h
new file mode 100644
index 0000000000..71c9f4b752
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FIXEDPOINT_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FIXEDPOINT_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED/QSYMM16
+ *
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final quantized value.
+ * The following computations will be performed by the kernel:
+ *
+ * -# Compute fixed point multiplication between each entry of input by gemmlowp_multiplier
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Round to nearest division by a power-of-two using result_shift
+ * -# Add offset to each result
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values to the proper quantized range and cast to QASYMM8/QASYMM8_SIGNED/QSYMM16.
+ */
+class ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel : public IClKernel
+{
+public:
+ ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel);
+ /** Initialise the kernel's source and destination.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src.
+ * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16.
+ * @param[in] info Output stage info. Used to pass the quantized output data type
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FIXEDPOINT_KERNEL_H */
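
Note (not part of the patch): a rough kernel-level usage sketch of the class declared above. Shapes, the multiplier/shift/offset values, the wrapping function and the include paths are all illustrative assumptions; only the GEMMLowpOutputStageInfo fields actually read by the kernel's configure() are set.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h"

using namespace arm_compute;

void configure_output_stage_sketch(const CLCompileContext &compile_context)
{
    // S32 accumulators of a 64x16 GEMMLowp result, optional bias of length 64.
    TensorInfo src(TensorShape(64U, 16U), 1, DataType::S32);
    TensorInfo bias(TensorShape(64U), 1, DataType::S32);
    TensorInfo dst; // left empty: configure() auto-initializes it from src and info

    GEMMLowpOutputStageInfo info{};
    info.output_data_type    = DataType::QASYMM8;
    info.gemmlowp_multiplier = 1073741824; // example fixed-point multiplier
    info.gemmlowp_shift      = 5;          // example rounding right shift
    info.gemmlowp_offset     = 10;         // offset added after the shift
    info.gemmlowp_min_bound  = 0;
    info.gemmlowp_max_bound  = 255;

    opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel kernel;
    if (bool(opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(&src, &bias, &dst, &info)))
    {
        kernel.configure(compile_context, &src, &bias, &dst, &info);
    }
}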
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp
new file mode 100644
index 0000000000..52ebd32d46
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) &&
+ (info->output_data_type != DataType::QASYMM8_SIGNED));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ info->gemmlowp_max_bound >
+ std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ info->gemmlowp_min_bound <
+ std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)) ||
+ info->gemmlowp_min_bound > info->gemmlowp_max_bound);
+
+ // Check biases if they exist
+ if (bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
+ }
+
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching output data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+ }
+
+ return Status{};
+}
+} // namespace
+
+ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info));
+
+ return Status{};
+}
+
+void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *info)
+{
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info));
+
+ auto padding_info = get_padding_info({src, bias, dst});
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type));
+
+ const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0));
+
+ auto min = info->gemmlowp_min_bound;
+ auto max = info->gemmlowp_max_bound;
+
+ // Set the arguments to pass at compile time
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(info->gemmlowp_real_multiplier));
+ build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(info->gemmlowp_offset));
+ build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
+ build_opts.add_option_if((min > 0), "-DMIN_BOUND=" + support::cpp11::to_string(min));
+ build_opts.add_option_if((max < 255), "-DMAX_BOUND=" + support::cpp11::to_string(max));
+ build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+
+ const std::string kernel_name = "gemmlowp_output_stage_quantize_down_float";
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
+ ICLKernel::configure_internal(win);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ const auto bias =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ // Create input window
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ // Setup bias slice
+ unsigned int idx1 = num_arguments_per_3D_tensor();
+ if (bias != nullptr)
+ {
+ Window biases_slice(slice);
+ biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+ biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+ add_1D_tensor_argument(idx1, bias, biases_slice);
+ }
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src, slice);
+ add_3D_tensor_argument(idx1, dst, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
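
Note (not part of the patch): compared with the fixed-point variant, this output stage folds the rescaling into a single floating-point multiplier (REAL_MULTIPLIER) plus an output offset. A rough scalar model for illustration only; the exact rounding and saturation behaviour of the device kernel is authoritative, and a QASYMM8 destination is assumed.

#include <algorithm>
#include <cmath>
#include <cstdint>

// One S32 accumulator -> one QASYMM8 value, mirroring the compile-time options above.
uint8_t quantize_down_float(int32_t acc, int32_t bias, float real_multiplier,
                            int32_t output_offset, int32_t min_bound, int32_t max_bound)
{
    acc += bias;                                                    // -DADD_BIAS
    const float scaled = static_cast<float>(acc) * real_multiplier  // -DREAL_MULTIPLIER
                         + static_cast<float>(output_offset);       // -DOUTPUT_OFFSET
    int32_t result = static_cast<int32_t>(std::lround(scaled));     // rounding mode assumed
    result = std::min(std::max(result, min_bound), max_bound);      // -DMIN_BOUND / -DMAX_BOUND
    return static_cast<uint8_t>(result);                            // QASYMM8 destination assumed
}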
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h
new file mode 100644
index 0000000000..057c66767f
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FLOAT_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FLOAT_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * The following computations will be performed by the kernel:
+ *
+ * -# Multiply each entry of the input by the real (floating-point) multiplier gemmlowp_real_multiplier
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Requantize
+ * -# Add offset to each result
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values:
+ * - to the [0..255] range and cast to QASYMM8.
+ * - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ */
+class ClGemmLowpQuantizeDownInt32ScaleByFloatKernel : public IClKernel
+{
+public:
+ ClGemmLowpQuantizeDownInt32ScaleByFloatKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpQuantizeDownInt32ScaleByFloatKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src.
+ * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+ * @param[in] info Output stage info. Used to pass the quantized output data type
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FLOAT_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp
new file mode 100644
index 0000000000..31434ce61b
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *output_stage)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) &&
+ (output_stage->output_data_type != DataType::QASYMM8_SIGNED));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ output_stage->gemmlowp_max_bound >
+ std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ output_stage->gemmlowp_min_bound <
+ std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) ||
+ output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound);
+
+ // Check biases if they exist
+ if (bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
+ }
+
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != output_stage->output_data_type,
+ "Mismatching output data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+ }
+
+ return Status{};
+}
+} // namespace
+
+ClGemmLowpQuantizeDownInt32ScaleKernel::ClGemmLowpQuantizeDownInt32ScaleKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *output_stage)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage));
+
+ return Status{};
+}
+
+void ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *output_stage)
+{
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, output_stage));
+
+ auto padding_info = get_padding_info({src, bias, dst});
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type));
+
+ const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0));
+
+ // Set the arguments to pass at compile time
+ auto min = output_stage->gemmlowp_min_bound;
+ auto max = output_stage->gemmlowp_max_bound;
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage->gemmlowp_offset));
+ build_opts.add_option("-DRESULT_MULT_INT=" + support::cpp11::to_string(output_stage->gemmlowp_multiplier));
+ build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage->gemmlowp_shift));
+ build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(
+ output_stage->output_data_type))) &&
+ (min != max),
+ "-DMIN_BOUND=" + support::cpp11::to_string(min));
+ build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(
+ output_stage->output_data_type))) &&
+ (min != max),
+ "-DMAX_BOUND=" + support::cpp11::to_string(max));
+ build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
+ build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+
+ const std::string kernel_name = "gemmlowp_output_stage_quantize_down";
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
+ ICLKernel::configure_internal(win);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+void ClGemmLowpQuantizeDownInt32ScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ const auto bias =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ unsigned int idx1 = num_arguments_per_3D_tensor();
+ if (bias != nullptr)
+ {
+ Window biases_slice(slice);
+ biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+ biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+ add_1D_tensor_argument(idx1, bias, biases_slice);
+ }
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src, slice);
+ add_3D_tensor_argument(idx1, dst, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
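
Note (not part of the patch): this variant stays entirely in the integer domain, using an offset, an integer multiplier and an arithmetic shift instead of a fixed-point or floating-point multiplier. A rough scalar model for illustration only; the ordering of the bias and offset additions is an assumption and the device kernel is authoritative.

#include <algorithm>
#include <cstdint>

// One S32 accumulator -> one quantized value, mirroring the compile-time options above.
int32_t quantize_down_scale(int32_t acc, int32_t bias, int32_t result_offset, int32_t result_mult_int,
                            int32_t result_shift, int32_t min_bound, int32_t max_bound)
{
    acc += bias;            // -DADD_BIAS
    acc += result_offset;   // -DRESULT_OFFSET
    acc *= result_mult_int; // -DRESULT_MULT_INT
    acc >>= result_shift;   // -DRESULT_SHIFT (non-negative shift assumed)
    return std::min(std::max(acc, min_bound), max_bound); // clamp, then cast to QASYMM8/QASYMM8_SIGNED
}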
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h
new file mode 100644
index 0000000000..e6390801f1
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * The following computations will be performed by the kernel:
+ *
+ * -# Add offset terms to final result
+ * -# Multiply each entry of result by result_mult_int
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Shift the int32 accumulator by result_shift
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values:
+ * - to the [0..255] range and cast to QASYMM8.
+ * - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ */
+class ClGemmLowpQuantizeDownInt32ScaleKernel : public ICLKernel
+{
+public:
+ ClGemmLowpQuantizeDownInt32ScaleKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpQuantizeDownInt32ScaleKernel);
+ /** Initialise the kernel's source and destination.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src.
+ * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+ * @param[in] output_stage GEMMLowp output stage metadata.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *output_stage);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *output_stage);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp
new file mode 100644
index 0000000000..ee4a191fed
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2017-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8);
+
+ if (dst->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ dst->dimension(0) != src->dimension(1),
+ "Output vector must have length equal to the number of rows of the input matrix");
+ }
+ return Status{};
+}
+
+Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+
+ if (dst->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ dst->dimension(0) != src->dimension(0),
+ "Output vector must have length equal to the number of columns of the input matrix");
+ }
+ return Status{};
+}
+} // namespace
+
+IClGemmLowpReductionKernel::IClGemmLowpReductionKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *mtx_a,
+ ITensorInfo *vector_sum_row,
+ const GEMMLowpReductionKernelInfo &info)
+{
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*vector_sum_row, TensorShape(mtx_a->dimension(1)), 1, DataType::S32);
+
+ auto padding_info = get_padding_info({mtx_a, vector_sum_row});
+
+ // Set the arguments to pass at compile time
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(mtx_a->dimension(0)));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_a->data_type()));
+ build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_a->data_type()));
+ build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar));
+
+ const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+
+ std::string kernel_name = "gemmlowp_matrix_a_reduction" + std::string(is_dot8_supported ? "_dot8" : "");
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Configure kernel window
+ // This kernel does not need padding
+ Window win = calculate_max_window(*vector_sum_row, Steps());
+ ICLKernel::configure_internal(win);
+
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(mtx_a->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(mtx_a->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(mtx_a->dimension(2));
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a,
+ const ITensorInfo *vector_sum_row,
+ const GEMMLowpReductionKernelInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
+
+ return Status{};
+}
+
+void ClGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY);
+ Window slice_in = collapsed.first_slice_window_2D();
+ Window slice_out = collapsed.first_slice_window_2D();
+
+ // Setup input slice. Its dimensions are increased in the cl kernel.
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src, slice_in);
+ add_2D_tensor_argument(idx, dst, slice_out);
+ enqueue(queue, *this, slice_out, lws_hint());
+ } while (collapsed.slide_window_slice_2D(slice_out));
+}
+
+void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *mtx_b,
+ ITensorInfo *vector_sum_col,
+ const GEMMLowpReductionKernelInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*vector_sum_col, TensorShape(mtx_b->dimension(0)), 1, DataType::S32);
+
+ auto padding_info = get_padding_info({mtx_b, vector_sum_col});
+
+ const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, mtx_b->dimension(0));
+
+ // Set the arguments to pass at compile time
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(mtx_b->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->dimension(0)));
+ build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->dimension(1)));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_b->data_type()));
+ build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_b->data_type()));
+ build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar));
+
+ const std::string kernel_name = "gemmlowp_matrix_b_reduction";
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*vector_sum_col, Steps(num_elems_processed_per_iteration));
+ IClKernel::configure_internal(win);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b,
+ const ITensorInfo *vector_sum_col,
+ const GEMMLowpReductionKernelInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
+
+ return Status{};
+}
+
+void ClGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY);
+
+ Window slice_out = collapsed.first_slice_window_2D();
+ Window slice_in = slice_out;
+
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src, slice_in);
+ add_2D_tensor_argument(idx, dst, slice_out);
+ enqueue(queue, *this, slice_out, lws_hint());
+ } while (collapsed.slide_window_slice_2D(slice_out));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
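
Note (not part of the patch): in scalar terms the two reduction kernels simply accumulate each row of matrix A and each column of matrix B into S32 vectors, optionally multiplied by a scalar when info.mul_by_scalar is set. A reference model for illustration only, assuming QASYMM8 inputs stored row-major; the signed and per-channel variants are analogous.

#include <cstdint>
#include <vector>

// gemmlowp_matrix_a_reduction: one sum per row of A (length M), A laid out as A[M][K].
std::vector<int32_t> matrix_a_reduction(const std::vector<uint8_t> &a, int m, int k, int32_t scalar = 1)
{
    std::vector<int32_t> vector_sum_row(m, 0);
    for (int i = 0; i < m; ++i)
    {
        for (int j = 0; j < k; ++j)
        {
            vector_sum_row[i] += a[i * k + j];
        }
        vector_sum_row[i] *= scalar; // only when info.mul_by_scalar is set
    }
    return vector_sum_row;
}

// gemmlowp_matrix_b_reduction: one sum per column of B (length N), B laid out as B[K][N].
std::vector<int32_t> matrix_b_reduction(const std::vector<uint8_t> &b, int k, int n, int32_t scalar = 1)
{
    std::vector<int32_t> vector_sum_col(n, 0);
    for (int i = 0; i < k; ++i)
    {
        for (int j = 0; j < n; ++j)
        {
            vector_sum_col[j] += b[i * n + j];
        }
    }
    for (int j = 0; j < n; ++j)
    {
        vector_sum_col[j] *= scalar; // only when info.mul_by_scalar is set
    }
    return vector_sum_col;
}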
diff --git a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h
new file mode 100644
index 0000000000..c81543e4c2
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Common interface for all OpenCL reduction kernels */
+class IClGemmLowpReductionKernel : public IClKernel
+{
+public:
+ IClGemmLowpReductionKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClGemmLowpReductionKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
+ * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
+ * @param[in] info Kernel metadata:
+ * - k Number of matrix columns/rows depending on the type of reduction.
+ * - is_reshaped True if the matrix has been reshaped.
+ * - scalar Scalar value to multiply each reduced column/row by.
+ * - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
+ */
+ virtual void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ ITensorInfo *output,
+ const GEMMLowpReductionKernelInfo &info) = 0;
+};
+
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ */
+class ClGemmLowpMatrixAReductionKernel : public IClGemmLowpReductionKernel
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
+ * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
+ * @param[in] info Kernel metadata:
+ * - k Number of matrix columns/rows depending on the type of reduction.
+ * - is_reshaped True if the matrix has been reshaped.
+ * - scalar Scalar value to multiply each reduced column/row by.
+ * - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *mtx_a,
+ ITensorInfo *vector_sum_row,
+ const GEMMLowpReductionKernelInfo &info) override;
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClGemmLowpMatrixAReductionKernel::configure()
+ *
+ * @return a status
+ */
+ static Status
+ validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ */
+class ClGemmLowpMatrixBReductionKernel : public IClGemmLowpReductionKernel
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] mtx_b Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
+ * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
+ * @param[in] info Kernel metadata:
+ * - k Number of matrix columns/rows depending on the type of reduction.
+ * - is_reshaped True if the matrix has been reshaped.
+ * - scalar Scalar value to multiply each reduced column/row by.
+ * - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *mtx_b,
+ ITensorInfo *vector_sum_col,
+ const GEMMLowpReductionKernelInfo &info) override;
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClGemmLowpMatrixBReductionKernel::configure()
+ *
+ * @return a status
+ */
+ static Status
+ validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H */
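
Note (not part of the patch): the reduction vectors exist to feed the offset-contribution stage whose a_offset/b_offset parameters appear at the top of this patch. With zero points a_zp and b_zp, the quantized product expands as sum_k (A[i][k] - a_zp) * (B[k][j] - b_zp) = raw_acc - b_zp * row_sum_A[i] - a_zp * col_sum_B[j] + K * a_zp * b_zp, so only the raw product plus the two reduction vectors are needed per output element. A scalar sketch of that correction, for illustration only; the sign convention of the a_offset/b_offset parameters in the contribution kernels may differ.

#include <cstdint>

// Correction applied on top of the raw quantized accumulator for output element (i, j).
int32_t offset_contribution(int32_t raw_acc,   // sum_k A[i][k] * B[k][j] on the raw quantized values
                            int32_t row_sum_a, // vector_sum_row[i] from ClGemmLowpMatrixAReductionKernel
                            int32_t col_sum_b, // vector_sum_col[j] from ClGemmLowpMatrixBReductionKernel
                            int32_t a_zp,      // zero point of the LHS matrix
                            int32_t b_zp,      // zero point of the RHS matrix
                            int32_t k)         // depth of the matrix multiplication
{
    return raw_acc - b_zp * row_sum_a - a_zp * col_sum_b + k * a_zp * b_zp;
}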
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
new file mode 100644
index 0000000000..fd23aa9924
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
@@ -0,0 +1,471 @@
+/*
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLUtils.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/helpers/float_ops.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+ "The number of dimensions for the LHS matrix must be <= 4");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+ "The number of dimensions for the RHS matrix must be <= 3");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3),
+ "Only 2,3,4,8,16 are supported for k0");
+ ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16);
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+ "Only 2,3,4,8,16 are supported for n0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) &&
+ (!gemm_info.broadcast_bias),
+ "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for GEMM native");
+
+ const unsigned int m = gemm_info.m;
+ const unsigned int n = gemm_info.n;
+ const unsigned int k = gemm_info.k;
+
+ ARM_COMPUTE_UNUSED(m);
+ ARM_COMPUTE_UNUSED(n);
+ ARM_COMPUTE_UNUSED(k);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k);
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != n);
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != k);
+ if (gemm_info.reinterpret_input_as_3d)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != m);
+ }
+
+ if (src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
+ {
+ const unsigned int src2_dim0 = src2->dimension(0);
+ const unsigned int src2_dim1 = src2->dimension(1);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1);
+ if (gemm_info.broadcast_bias)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n),
+ "Incorrect dimension of bias matrix which is to be broadcasted");
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix");
+ }
+ }
+
+ if (dst->total_size() != 0)
+ {
+ const TensorInfo tensor_info_dst =
+ dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info,
+ ElementsProcessed &num_elements_processed)
+{
+ unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+ unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
+ bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
+
+ Window win{};
+ Window win_out{};
+ bool window_changed = false;
+
+ // In case both input and dst have to be reinterpreted as 3D tensors,
+ // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+ if (reinterpret_input_as_3d == reinterpret_output_as_3d)
+ {
+ reinterpret_output_as_3d = false;
+ }
+
+ // dst tensor auto initialization if not yet initialized
+ auto_init_if_empty(
+ *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+
+ TensorInfo tmp_info(*dst);
+
+ if (reinterpret_output_as_3d)
+ {
+ // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+ // the window needs to be constructed on the 2D collapsed version of the tensor
+ TensorShape tmp_shape(dst->tensor_shape());
+ tmp_shape.collapse(2U, 1U);
+ tmp_info.set_tensor_shape(tmp_shape);
+ }
+
+ // Configure kernel window
+ num_elems_processed_per_iteration_x = rhs_info.n0;
+ num_elems_processed_per_iteration_y = lhs_info.m0;
+
+ win =
+ calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ win_out =
+ calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowStatic src0_access(src0, 0, 0, src0->dimension(0), src0->dimension(1));
+ AccessWindowStatic src1_access(
+ src1, 0, 0, ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), src1->dimension(1));
+ AccessWindowStatic dst_access(dst, 0, 0, dst->dimension(0), dst->dimension(1));
+
+ if (src2 != nullptr)
+ {
+ const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
+
+ AccessWindowStatic src2_access(src2, 0, 0, ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x),
+ src2->dimension(1));
+
+ window_changed = update_window_and_padding(win, src0_access, src1_access,
+ src2_access) || // window used by the execute_window_loop
+ update_window_and_padding(
+ win_out, dst_access); // window used to update the padding requirements of dst tensor
+ }
+ else
+ {
+ window_changed =
+ update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop
+ update_window_and_padding(win_out,
+ dst_access); // window used to update the padding requirements of dst tensor
+ }
+
+ // Collapse along the Z direction
+ // This collapse needs to be here in order to tune the Z dimension of LWS
+ Window collapsed = win;
+ const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
+ collapsed = win.collapse(win, dimension_to_collapse);
+
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, collapsed);
+}
+} // namespace
+
+ClGemmMatrixMultiplyNativeKernel::ClGemmMatrixMultiplyNativeKernel()
+{
+ _type = CLKernelType::GEMM;
+}
+
+void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src0,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+
+ // dst tensor auto initialization if not yet initialized
+ auto_init_if_empty(
+ *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
+
+ auto padding_info = get_padding_info({src0, dst});
+ _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
+ _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
+ _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
+ _add_bias = src2 != nullptr;
+
+ // In case both input and dst have to be reinterpreted as 3D tensors,
+ // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+ if (_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+ {
+ _reinterpret_input_as_3d = false;
+ _reinterpret_output_as_3d = false;
+ }
+
+ // Check if we need to slide the matrix B
+ const unsigned int num_dimensions_src0 = src0->num_dimensions();
+ _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0);
+
+ ElementsProcessed num_elements_processed{};
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(src0, src1, src2 != nullptr ? src2 : nullptr, dst, lhs_info,
+ rhs_info, gemm_info, num_elements_processed);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ IClKernel::configure_internal(win_config.second);
+
+ // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
+ // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
+ // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m
+ const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1);
+
+ const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(1) : src0->dimension(1);
+ const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(2) : src0->dimension(2);
+
+ // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+ const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
+ const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
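+ // e.g. with internal_m = 13 and lhs_info.m0 = 4, the last block along M stores only 13 % 4 = 1 row instead of padding up to 4.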
+
+ // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
+ // NOTE: This might have implications on heuristics and performance
+ const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
+ _m = internal_m;
+ _n = gemm_info.n;
+ _k = gemm_info.k;
+
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
+ build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)),
+ "-DALPHA=" + float_to_string_with_full_precision(alpha));
+ build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
+ build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
+ build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
+ build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+ build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+ build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+ "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
+ build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+ "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
+ build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
+ build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+ build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+ build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(),
+ "-DACTIVATION_TYPE=" +
+ lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(),
+ "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(),
+ "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
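+ // For example, an F32 GEMM with m0 = 4, n0 = 4, k0 = 4 and no fused activation ends up with options such as
+ // -DDATA_TYPE=float -DM0=4 -DN0=4 -DK0=4 plus the PARTIAL_STORE_M0/N0 macros.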
+
+ std::string kernel_name("gemm_mm_native");
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += (_add_bias ? "add_bias_" : "");
+ _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : "");
+ _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
+ _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+ _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : "");
+ _config_id += lower_string(string_from_data_type(src0->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(gemm_info.k);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.m0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.n0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.k0);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info)
+{
+ ElementsProcessed num_elements_processed{};
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(),
+ src2 != nullptr ? src2->clone().get() : nullptr,
+ dst->clone().get(), lhs_info, rhs_info, gemm_info,
+ num_elements_processed)
+ .first);
+
+ return Status{};
+}
+
+void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto src1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const auto src2 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr);
+
+ if (src1->info()->num_dimensions() < 3)
+ {
+ // The stride_z for matrix B must be zero if we do not slice
+ ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
+ }
+
+ Window slice = window.first_slice_window_3D();
+ Window slice_matrix_b = slice;
+
+ slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+ slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ if (_reinterpret_input_as_3d)
+ {
+ // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
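+ // The pad argument sits right after the per-slice arguments set in the loop below: with bias that is
+ // 4 2D tensors, 4 stride_z values and the m, n, k scalars, hence 4 * num_arguments_per_2D_tensor() + 7.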
+ unsigned int idx0;
+ if (_add_bias)
+ {
+ idx0 = 4 * num_arguments_per_2D_tensor() + 7;
+ }
+ else
+ {
+ idx0 = 3 * num_arguments_per_2D_tensor() + 6;
+ }
+ const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom;
+ _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+ }
+
+ if (_reinterpret_output_as_3d)
+ {
+ // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor
+ unsigned int idx0;
+ if (_add_bias)
+ {
+ idx0 = 4 * num_arguments_per_2D_tensor() + 7 + (_reinterpret_input_as_3d ? 1 : 0);
+ }
+ else
+ {
+ idx0 = 3 * num_arguments_per_2D_tensor() + 6 + (_reinterpret_input_as_3d ? 1 : 0);
+ }
+ const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
+ _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+ }
+
+ do
+ {
+ Window slice_b = slice;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if (!_slide_matrix_b)
+ {
+ slice_b = slice_matrix_b;
+ }
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, src0, slice);
+ add_2D_tensor_argument(idx, src1, slice_b);
+ if (_add_bias)
+ {
+ add_2D_tensor_argument(idx, src2, slice);
+ }
+ add_2D_tensor_argument(idx, dst, slice);
+
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
+ if (_add_bias)
+ {
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2]));
+ }
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
+
+ // Pass m, n and k at runtime
+ _kernel.setArg<cl_int>(idx++, _m);
+ _kernel.setArg<cl_int>(idx++, _n);
+ _kernel.setArg<cl_int>(idx++, _k);
+
+ enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
+ } while (window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
new file mode 100644
index 0000000000..da6c9a5bb7
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
+#define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to multiply matrices when neither of the input matrices have been reshaped */
+class ClGemmMatrixMultiplyNativeKernel : public IClKernel
+{
+public:
+ ClGemmMatrixMultiplyNativeKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyNativeKernel);
+ /** Initialise the kernel's input and dst.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src0 Input tensor for the LHS matrix. Data type supported: F32/F16. The number of dimensions for the LHS matrix must be less than or equal to 4.
+ * @param[in] src1 Input tensor for the RHS matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less than or equal to 3.
+ * @param[in] src2 Input tensor containing the bias matrix. Data type supported: same as @p src0.
+ * @param[out] dst Destination tensor info. Data type supported: same as @p src0
+ * @param[in] alpha Weight of the matrix product
+ * @param[in] beta Weight of the matrix bias
+ * @param[in] lhs_info LHS matrix information used to retrieve the number of rows and accumulations to be processed by each thread. Only the following values are supported:
+ * lhs_info.m0: 1,2,3,4,5,6,7,8
+ * lhs_info.k0: 2,3,4,8,16
+ * @param[in] rhs_info RHS matrix information used to retrieve the number of columns and accumulations to be processed by each thread. Only the following values are supported:
+ * rhs_info.n0: 2,3,4,8,16
+ * rhs_info.k0: same as lhs_info.k0
+ * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src0,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClGemmMatrixMultiplyNativeKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ bool _slide_matrix_b{true};
+ bool _reinterpret_input_as_3d{false};
+ bool _reinterpret_output_as_3d{false};
+ bool _use_dummy_work_items{false};
+ bool _add_bias{false};
+ signed int _m{1};
+ signed int _n{1};
+ signed int _k{1};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
new file mode 100644
index 0000000000..4fe6bddb36
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
@@ -0,0 +1,447 @@
+/*
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLUtils.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/helpers/float_ops.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+ "The number of dimensions for the LHS matrix must be <= 4");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+ "The number of dimensions for the RHS matrix must be <= 3");
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose == rhs_info.transpose);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3),
+ "Only 2,3,4,8,16 are supported for k0");
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_info.transpose) && ((lhs_info.m0 & (lhs_info.m0 - 1)) && lhs_info.m0 != 3),
+ "Only 2,3,4,8,16 are supported for m0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.transpose) && ((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+ "Only 2,3,4,8,16 are supported for n0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) &&
+ (!gemm_info.broadcast_bias),
+ "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32),
+ "Mixed precision only supported for F16 data type");
+ ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info));
+
+ const unsigned int m = gemm_info.m;
+ const unsigned int n = gemm_info.n;
+ const unsigned int k = gemm_info.k;
+
+ TensorShape tensor_shape0{src0->tensor_shape()};
+ tensor_shape0.set(0, k);
+ tensor_shape0.set(1, m);
+
+ TensorShape tensor_shape1{src1->tensor_shape()};
+ tensor_shape1.set(0, n);
+ tensor_shape1.set(1, k);
+
+ if (src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
+ {
+ const unsigned int src2_dim0 = src2->dimension(0);
+ const unsigned int src2_dim1 = src2->dimension(1);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1);
+ if (gemm_info.broadcast_bias)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n),
+ "Incorrect dimension of bias matrix which is to be broadcasted");
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix");
+ }
+ }
+
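+ // Rebuild the original (non-reshaped) LHS and RHS infos and apply the reshape descriptors to them:
+ // the resulting shapes must match the reshaped inputs actually provided.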
+ const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0);
+ const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
+
+ const TensorInfo tensor_info_reshaped0 =
+ src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(tensor_info0, lhs_info));
+ const TensorInfo tensor_info_reshaped1 =
+ src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
+
+ if (dst->total_size() != 0)
+ {
+ const TensorInfo tensor_info_dst =
+ dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info,
+ ElementsProcessed &num_elements_processed)
+{
+ ARM_COMPUTE_UNUSED(src0, src1, src2);
+ unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+ unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+ bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
+
+ TensorInfo tmp_info(*dst);
+
+ if (reinterpret_output_as_3d)
+ {
+ // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+ // the window needs to be constructed on the 2D collapsed version of the tensor
+ TensorShape tmp_shape(dst->tensor_shape());
+ tmp_shape.collapse(2U, 1U);
+ tmp_info.set_tensor_shape(tmp_shape);
+ }
+
+ // Configure kernel window
+ num_elems_processed_per_iteration_x = rhs_info.n0;
+ num_elems_processed_per_iteration_y = lhs_info.m0;
+
+ Window win =
+ calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ // Collapse along the Z direction
+ // This collapse needs to be here in order to tune the Z dimension of LWS
+ Window collapsed = win;
+ const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
+ collapsed = win.collapse(win, dimension_to_collapse);
+
+ return std::make_pair(Status{}, collapsed);
+}
+} // namespace
+
+ClGemmMatrixMultiplyReshapedKernel::ClGemmMatrixMultiplyReshapedKernel()
+{
+ _type = CLKernelType::GEMM;
+}
+
+void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+
+ // dst tensor auto initialization if not yet initialized
+ auto_init_if_empty(
+ *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
+
+ auto padding_info = get_padding_info({src0, src1, src2, dst});
+ _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
+ _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
+ _add_bias = src2 != nullptr;
+ _export_to_cl_image = rhs_info.export_to_cl_image;
+
+ // Check if we need to slide the matrix B
+ const unsigned int num_dimensions_src0 = src0->num_dimensions();
+ _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0);
+
+ ElementsProcessed num_elements_processed{};
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(
+ src0->clone().get(), src1->clone().get(), (src2 != nullptr) ? src2->clone().get() : nullptr, dst->clone().get(),
+ lhs_info, rhs_info, gemm_info, num_elements_processed);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ const bool enable_mixed_precision = gemm_info.fp_mixed_precision;
+ const DataType data_type = src0->data_type();
+
+ // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+ const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1);
+
+ const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
+ const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
+ _m = gemm_info.m;
+ _n = gemm_info.n;
+ _k = gemm_info.k;
+
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)),
+ "-DALPHA=" + float_to_string_with_full_precision(alpha));
+ build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
+ build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
+ build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+ build_opts.add_option_if(_reinterpret_output_as_3d,
+ "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
+ build_opts.add_option_if(_reinterpret_output_as_3d,
+ "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
+ build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
+ build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
+ build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
+ build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+ build_opts.add_option_if(lhs_info.transpose, "-DLHS_TRANSPOSE");
+ build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+ build_opts.add_option_if(enable_mixed_precision, "-DMIXED_PRECISION");
+ build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT");
+ build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1)));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision
+ ? get_cl_type_from_data_type(DataType::F32)
+ : get_cl_type_from_data_type(data_type)));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
+ build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
+ build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+ build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+ build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(),
+ "-DACTIVATION_TYPE=" +
+ lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(),
+ "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(),
+ "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
+
+ std::string kernel_name("gemm_mm_reshaped_");
+ kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
+ kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt";
+ kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
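+ // e.g. "gemm_mm_reshaped_lhs_nt_rhs_t_texture" for a non-transposed LHS and a transposed RHS read through an OpenCL image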
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += (_add_bias ? "add_bias_" : "");
+ _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : "");
+ _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+ _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : "");
+ _config_id += lower_string(string_from_data_type(src0->data_type()));
+ _config_id += "_";
+ _config_id += (enable_mixed_precision ? "mixed_precision_" : "");
+ _config_id += support::cpp11::to_string(dst->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(gemm_info.k);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.m0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.n0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.k0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.v0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.h0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.interleave);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.interleave);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
+ return Status{};
+}
+
+void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto src1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const auto src2 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr);
+
+ if (src1->info()->num_dimensions() < 3)
+ {
+ // The stride_z for matrix B must be zero if we do not slice
+ ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
+ }
+
+ Window slice = window.first_slice_window_3D();
+ Window slice_matrix_b = slice;
+
+ slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+ slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
+
+ cl::Image2D src1_image2d;
+
+ if (_export_to_cl_image)
+ {
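+ // Each image texel packs four F32 values, so the image width is dimension(0) / 4,
+ // while the rows of all batches are stacked along the image height.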
+ const TensorShape shape2d(src1->info()->dimension(0) / 4,
+ src1->info()->dimension(1) * src1->info()->dimension(2));
+ const size_t image_row_pitch = src1->info()->strides_in_bytes()[1];
+
+ src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d,
+ src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+ }
+
+ do
+ {
+ Window slice_b = slice;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if (!_slide_matrix_b)
+ {
+ slice_b = slice_matrix_b;
+ }
+
+ unsigned int idx = 0;
+
+ // LHS buffer
+ add_2D_tensor_argument(idx, src0, slice);
+
+ // RHS buffer or RHS OpenCL image (_export_to_cl_image == true)
+ if (_export_to_cl_image)
+ {
+ _kernel.setArg(idx++, src1_image2d);
+ }
+ else
+ {
+ add_2D_tensor_argument(idx, src1, slice_b);
+ }
+
+ // Bias buffer (_add_bias == true)
+ add_2D_tensor_argument_if(_add_bias, idx, src2, slice);
+
+ // dst buffer
+ add_2D_tensor_argument(idx, dst, slice);
+
+ // LHS stride_z
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
+
+ // RHS stride_z (not used if _export_to_cl_image == true)
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
+
+ // Bias stride_z (if _add_bias == true)
+ if (_add_bias)
+ {
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2]));
+ }
+
+ // dst stride_z
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
+
+ // Cross-plane padding (if _reinterpret_output_as_3d == true)
+ if (_reinterpret_output_as_3d)
+ {
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad));
+ }
+
+ // Pass m, n and k at runtime
+ _kernel.setArg<cl_int>(idx++, _m);
+ _kernel.setArg<cl_int>(idx++, _n);
+
+ // K dimension (not used if _export_to_cl_image == true)
+ _kernel.setArg<cl_int>(idx++, _k);
+
+ // Dispatch kernel
+ enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
+ } while (window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
new file mode 100644
index 0000000000..30928c4e1d
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
+#define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to multiply matrices when both the input matrices LHS (src0) and RHS (src1) have been reshaped
+ *
+ * @note The input matrices @p src0 and @p src1 must be reshaped through:
+ * - @ref ClGemmReshapeLhsMatrixKernel
+ * - @ref ClGemmReshapeRhsMatrixKernel
+ */
+class ClGemmMatrixMultiplyReshapedKernel : public IClKernel
+{
+public:
+ ClGemmMatrixMultiplyReshapedKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyReshapedKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @note The F16 computation also supports mixed precision through the gemm_info.fp_mixed_precision flag.
+ * Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the
+ * multiplications. i.e. float c = (half)a * (half)b
+ *
+ * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
+ * Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer,
+ * the following conditions are required:
+ * -# rhs_info.n0 can only be 4, 8 and 16
+ * -# rhs_info.k0 can only be 4, 8 and 16
+ * -# Data type can only be F32
+ * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+ * -# The stride Y for the src1 should satisfy the OpenCL pitch alignment requirement
+ * -# src1 width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+ * -# src1 (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less than or equal to 4
+ * @param[in] src1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less than or equal to 3
+ * @param[in] src2 Input tensor containing the bias matrix. Data type supported: same as @p src0.
+ * @param[out] dst Destination tensor storing the result of the matrix multiplication. Data type supported: same as @p src0
+ * @param[in] alpha Weight of the matrix product
+ * @param[in] beta Weight of the matrix bias
+ * @param[in] lhs_info LHS matrix information used for reshaping the src0 tensor. Only the following values are supported:
+ * lhs_info.m0: 2,3,4,5,6,7,8
+ * lhs_info.k0: 2,3,4,8,16
+ * lhs_info.transpose: false
+ * @param[in] rhs_info RHS matrix information used for reshaping the src1 tensor. Only the following values are supported:
+ * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
+ * rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
+ * rhs_info.transpose: true
+ * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
+ *
+ * @note lhs_info.k0 must be equal to rhs_info.k0
+ */
+ void configure(const ClCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClGemmMatrixMultiplyReshapedKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ bool _slide_matrix_b{true};
+ bool _reinterpret_output_as_3d{false};
+ bool _use_dummy_work_items{false};
+ bool _add_bias{false};
+ bool _export_to_cl_image{false};
+ signed int _m{1};
+ signed int _n{1};
+ signed int _k{1};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
new file mode 100644
index 0000000000..1b19f1ec5b
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
@@ -0,0 +1,473 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLUtils.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/helpers/float_ops.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+ "The number of dimensions for the LHS matrix must be <= 4");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+ "The number of dimensions for the RHS matrix must be <= 3");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_info.m0 < 1 || lhs_info.m0 > 8, "Only 1,2,3,4,5,6,7,8 are supported for m0");
+ ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16 || rhs_info.k0 < 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3),
+ "Only 2,3,4,8,16 are supported for k0");
+ ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16 || rhs_info.n0 < 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+ "Only 2,3,4,8,16 are supported for n0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) &&
+ (!gemm_info.broadcast_bias),
+ "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
+ ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info));
+
+ const unsigned int m = gemm_info.m;
+ const unsigned int n = gemm_info.n;
+ const unsigned int k = gemm_info.k;
+
+ TensorShape tensor_shape1{src1->tensor_shape()};
+ tensor_shape1.set(0, n);
+ tensor_shape1.set(1, k);
+
+ if (src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
+ {
+ const unsigned int src2_dim0 = src2->dimension(0);
+ const unsigned int src2_dim1 = src2->dimension(1);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src0);
+ if (gemm_info.broadcast_bias)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n),
+ "Incorrect dimension of bias matrix which is to be broadcasted");
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix");
+ }
+ }
+
+ const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
+
+ const TensorInfo tensor_info_reshaped1 =
+ src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+
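+ // Only the RHS is reshaped in this kernel: the LHS must still expose K along x and M along y
+ // (or along the collapsed y*z when the input is reinterpreted as 3D).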
+ ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k);
+ if (gemm_info.reinterpret_input_as_3d)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != m);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
+
+ if (dst->total_size() != 0)
+ {
+ const TensorInfo tensor_info_dst =
+ dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
+ }
+
+ return Status{};
+}
+
+Window validate_and_configure_window(ITensorInfo *src0,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info,
+ ElementsProcessed &num_elements_processed)
+{
+ ARM_COMPUTE_UNUSED(src0, src1, src2);
+ unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+ unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
+ bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
+
+ // In case both input and dst have to be reinterpreted as 3D tensors,
+ // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+ // This approach should only be used when the input/dst tensors have pad on the y direction
+ if ((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y)
+ {
+ reinterpret_output_as_3d = false;
+ }
+
+ TensorInfo tmp_info(*dst);
+
+ if (reinterpret_output_as_3d)
+ {
+ // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+ // the window needs to be constructed on the 2D collapsed version of the tensor
+ TensorShape tmp_shape(dst->tensor_shape());
+ tmp_shape.collapse(2U, 1U);
+ tmp_info.set_tensor_shape(tmp_shape);
+ }
+
+ // Configure kernel window
+ num_elems_processed_per_iteration_x = rhs_info.n0;
+ num_elems_processed_per_iteration_y = lhs_info.m0;
+
+ Window win =
+ calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ // Collapse along the Z direction
+ // This collapse needs to be here in order to tune the Z dimension of LWS
+ const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
+ Window collapsed = win.collapse(win, dimension_to_collapse);
+
+ return collapsed;
+}
+} // namespace
+
+ClGemmMatrixMultiplyReshapedOnlyRhsKernel::ClGemmMatrixMultiplyReshapedOnlyRhsKernel()
+{
+ _type = CLKernelType::GEMM;
+}
+
+void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+
+ // dst tensor auto initialization if not yet initialized
+ auto_init_if_empty(
+ *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
+
+ _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
+ _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
+ _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
+ _add_bias = src2 != nullptr;
+ _export_to_cl_image = rhs_info.export_to_cl_image;
+ _has_pad_y = gemm_info.has_pad_y;
+
+ auto padding_info = get_padding_info({src0, src1, src2, dst});
+
+ // In case both input and dst have to be reinterpreted as 3D tensors,
+ // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+ if ((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y)
+ {
+ _reinterpret_input_as_3d = false;
+ _reinterpret_output_as_3d = false;
+ }
+
+ // Check if we need to slide the matrix B
+ const unsigned int num_dimensions_src0 = src0->num_dimensions();
+ _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0);
+
+ ElementsProcessed num_elements_processed{};
+
+ // Configure kernel window
+ Window win = validate_and_configure_window(src0->clone().get(), src1->clone().get(),
+ (src2 != nullptr) ? src2->clone().get() : nullptr, dst->clone().get(),
+ lhs_info, rhs_info, gemm_info, num_elements_processed);
+ ICLKernel::configure_internal(win);
+
+ // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
+ // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
+ // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m
+ const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1);
+
+ // These variables are used only if gemm_info.has_pad_y == true
+ const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(1) : src0->dimension(1);
+ const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(2) : src0->dimension(2);
+
+ // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
+ // NOTE: This might have implications on heuristics and performance
+ const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
+
+ // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+ const unsigned int partial_store_m0 = internal_m % internal_m0;
+ const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
+ _m = internal_m;
+ _n = gemm_info.n;
+ _k = gemm_info.k;
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
+ build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)),
+ "-DALPHA=" + float_to_string_with_full_precision(alpha));
+ build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
+ build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
+ build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
+ build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
+ build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+ build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+ build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT");
+ build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1)));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+ build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+ build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+ build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
+ if (_has_pad_y)
+ {
+ build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+ build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+ build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+ "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
+ build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+ "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
+ }
+
+ build_opts.add_option_if(gemm_info.activation_info.enabled(),
+ "-DACTIVATION_TYPE=" +
+ lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(),
+ "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(),
+ "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
+
+ std::string kernel_name("gemm_mm_reshaped_only_rhs_");
+ kernel_name += rhs_info.transpose ? "t" : "nt";
+ kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += (_has_pad_y ? "" : "no_pad_y_");
+ _config_id += (_add_bias ? "add_bias_" : "");
+ _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : "");
+ _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
+ _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+ _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : "");
+ _config_id += lower_string(string_from_data_type(src0->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(gemm_info.k);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.m0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.n0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.k0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.h0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.interleave);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
+ return Status{};
+}
+
+void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto src1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const auto src2 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr);
+
+ if (src1->info()->num_dimensions() < 3)
+ {
+ // The stride_z for matrix B must be zero if we do not slice
+ ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
+ }
+
+ const size_t lhs_idx_batch_size = _reinterpret_input_as_3d && !_has_pad_y ? 3u : 2u;
+ const size_t rhs_idx_batch_size = 2u;
+ const size_t bia_idx_batch_size = 2u;
+ const size_t out_idx_batch_size = _reinterpret_output_as_3d && !_has_pad_y ? 3u : 2u;
+
+ Window slice = window.first_slice_window_3D();
+ Window slice_matrix_b = slice;
+
+ slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+ slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ // Get cross plane pads
+ const unsigned int total_cross_plane_pad_lhs = src0->info()->padding().top + src0->info()->padding().bottom;
+ const unsigned int total_cross_plane_pad_out = dst->info()->padding().top + dst->info()->padding().bottom;
+
+ // The execution should fail if we try to run with has_pad_y = false but we have padding in either the LHS or DST tensor
+ ARM_COMPUTE_ERROR_ON(!_has_pad_y && ((total_cross_plane_pad_lhs != 0) || (total_cross_plane_pad_out != 0)));
+
+ cl::Image2D src1_image2d;
+
+ if (_export_to_cl_image)
+ {
+ const TensorShape shape2d(src1->info()->dimension(0) / 4,
+ src1->info()->dimension(1) * src1->info()->dimension(2));
+ const size_t image_row_pitch = src1->info()->strides_in_bytes()[1];
+
+ src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d,
+ src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+ }
+
+ do
+ {
+ Window slice_b = slice;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A has more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if (!_slide_matrix_b)
+ {
+ slice_b = slice_matrix_b;
+ }
+
+ unsigned int idx = 0;
+
+ // LHS buffer
+ add_2D_tensor_argument(idx, src0, slice);
+
+ // RHS buffer or RHS OpenCL image (_export_to_cl_image == true)
+ if (_export_to_cl_image)
+ {
+ _kernel.setArg(idx++, src1_image2d);
+ }
+ else
+ {
+ add_2D_tensor_argument(idx, src1, slice_b);
+ }
+
+ // Bias buffer (_add_bias == true)
+ add_2D_tensor_argument_if(_add_bias, idx, src2, slice);
+
+ // dst buffer
+ add_2D_tensor_argument(idx, dst, slice);
+
+ // LHS stride_z
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[lhs_idx_batch_size]));
+
+ // RHS stride_z (not used if _export_to_cl_image == true)
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[rhs_idx_batch_size]));
+
+ // Bias stride_z (if _add_bias == true)
+ if (_add_bias)
+ {
+ _kernel.setArg<cl_uint>(idx++,
+ static_cast<unsigned int>(src2->info()->strides_in_bytes()[bia_idx_batch_size]));
+ }
+
+ // dst stride_z
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[out_idx_batch_size]));
+
+ // Cross-plane padding (if _reinterpret_input_as_3d == true)
+ if (_reinterpret_input_as_3d && _has_pad_y)
+ {
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_lhs));
+ }
+
+ // Cross-plane padding (if _reinterpret_output_as_3d == true)
+ if (_reinterpret_output_as_3d && _has_pad_y)
+ {
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_out));
+ }
+
+ // Pass m, n and k at runtime as signed ints, to ensure that the results of any subtractions they appear in remain signed.
+ _kernel.setArg<cl_int>(idx++, _m);
+ _kernel.setArg<cl_int>(idx++, _n);
+ _kernel.setArg<cl_int>(idx++, _k);
+
+ enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
+ } while (window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
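Not part of the patch: the following sketch illustrates how the static validate() defined above might be exercised from inside the library, assuming the Compute Library internal headers are available; all shapes, block sizes and the helper name are hypothetical.

#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"

#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h"

arm_compute::Status try_validate_reshaped_only_rhs()
{
    using namespace arm_compute;

    const unsigned int m = 64, n = 48, k = 32;

    GEMMLHSMatrixInfo lhs_info{};
    lhs_info.m0 = 4; // rows processed by each work-item

    GEMMRHSMatrixInfo rhs_info{};
    rhs_info.n0         = 4;
    rhs_info.k0         = 4;
    rhs_info.h0         = 4;
    rhs_info.transpose  = true;
    rhs_info.interleave = true;

    GEMMKernelInfo gemm_info{};
    gemm_info.m = m;
    gemm_info.n = n;
    gemm_info.k = k;

    // LHS is [K, M]; the RHS info describes the tensor after ClGemmReshapeRhsMatrixKernel
    const TensorInfo src0(TensorShape(k, m), 1, DataType::F32);
    const TensorInfo rhs_original(TensorShape(n, k), 1, DataType::F32);
    const TensorInfo src1(misc::shape_calculator::compute_rhs_reshaped_shape(rhs_original, rhs_info), 1,
                          DataType::F32);
    TensorInfo dst; // left uninitialised: the dst shape check is skipped when total_size() == 0

    return opencl::kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(&src0, &src1, nullptr, &dst, 1.0f,
                                                                                0.0f, lhs_info, rhs_info, gemm_info);
}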
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
new file mode 100644
index 0000000000..e8fd78d476
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
+#define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to multiply matrices when only the input matrix RHS (src1) has been reshaped
+ *
+ * @note The input matrix src1 must be reshaped through @ref ClGemmReshapeRhsMatrixKernel
+ */
+class ClGemmMatrixMultiplyReshapedOnlyRhsKernel : public ICLKernel
+{
+public:
+ ClGemmMatrixMultiplyReshapedOnlyRhsKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyReshapedOnlyRhsKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel fetches the RHS data using the OpenCL read_image built-in function.
+ *       Reading from an OpenCL image object can improve performance. However, since the OpenCL image object is created by importing the OpenCL buffer,
+ *       the following conditions must be met:
+ *       -# rhs_info.n0 can only be 4, 8 or 16
+ *       -# rhs_info.k0 can only be 4, 8 or 16
+ *       -# The data type can only be F32
+ *       -# The platform must support the OpenCL cl_khr_image2d_from_buffer extension
+ *       -# The Y stride of src1 must satisfy the OpenCL pitch alignment requirement
+ *       -# The src1 width must be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+ *       -# The src1 (height * depth) must be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src0 Input tensor containing the LHS matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true).
+ *                             The number of dimensions of the LHS matrix must be less than or equal to 4.
+ * @param[in]  src1            Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0. The number of dimensions of the RHS matrix must be less than or equal to 3.
+ * @param[in] src2 Input tensor containing the bias matrix. Data type supported: same as @p src0.
+ * @param[out] dst Output tensor to store the result of matrix multiplication. Data type supported: same as @p src0
+ * @param[in] alpha Weight of the matrix product
+ * @param[in] beta Weight of the matrix bias
+ * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread. Only the following values are supported:
+ * lhs_info.m0: 1,2,3,4,5,6,7,8
+ * @param[in] rhs_info RHS matrix information used for reshaping the src1 tensor. Only the following values are supported:
+ * rhs_info.k0: 2,3,4,8,16
+ * rhs_info.n0: 2,3,4,8,16
+ * rhs_info.transpose: true,false
+ * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
+ */
+ void configure(const ClCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ bool _slide_matrix_b{true};
+ bool _reinterpret_input_as_3d{false};
+ bool _reinterpret_output_as_3d{false};
+ bool _use_dummy_work_items{false};
+ bool _add_bias{false};
+ bool _export_to_cl_image{false};
+ bool _has_pad_y{false};
+ signed int _m{1};
+ signed int _n{1};
+ signed int _k{1};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
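As an aside (not part of the patch), the export_to_cl_image preconditions listed in the note above can be summarised as a simple predicate. The sketch below merely paraphrases the documented rules; it is not the library's own validate_image2d_support_on_rhs() check, and it does not verify the device pitch-alignment requirement.

#include <cstddef>

// Paraphrase of the documented export_to_cl_image preconditions (illustrative only).
bool rhs_can_use_cl_image(int n0, int k0, bool is_f32, bool has_cl_khr_image2d_from_buffer,
                          std::size_t rhs_width, std::size_t rhs_height, std::size_t rhs_depth,
                          std::size_t image2d_max_width, std::size_t image2d_max_height)
{
    const bool n0_ok     = (n0 == 4) || (n0 == 8) || (n0 == 16);
    const bool k0_ok     = (k0 == 4) || (k0 == 8) || (k0 == 16);
    const bool width_ok  = rhs_width <= image2d_max_width * 4; // 4 elements packed per texel
    const bool height_ok = rhs_height * rhs_depth <= image2d_max_height;
    return n0_ok && k0_ok && is_f32 && has_cl_khr_image2d_from_buffer && width_ok && height_ok;
}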
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
new file mode 100644
index 0000000000..4ca4b83f9c
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLUtils.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/helpers/float_ops.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+using ElementsProcessed = Steps;
+
+// Block size dimensions for the MMUL extension
+constexpr int mmul_m0 = 4;
+constexpr int mmul_n0 = 4;
+constexpr int mmul_k0 = 4;
+
+Status validate_arguments(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()),
+ "The extension cl_arm_matrix_multiply is not supported on the target platform");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+ "The number of dimensions for the LHS matrix must be <= 4");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+ "The number of dimensions for the RHS matrix must be <= 3");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_info.m0 < 1, "Only values greater than 0 are supported for m0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.n0 != 1 && rhs_info.n0 != 2 && rhs_info.n0 != 3 && rhs_info.n0 != 4 &&
+ rhs_info.n0 != 8 && rhs_info.n0 != 16,
+ "Only 1,2,3,4,8, and 16 are supported for n0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.k0 != 1 || lhs_info.k0 != 1), "Only 1 is supported for k0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.h0 != 4), "Only 4 is supported for h0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.interleave != true,
+ "Only true is supported for interleave with mmul extension enabled");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.transpose != false,
+ "Only false is supported for transpose with mmul extension enabled");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
+ ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info));
+
+ const unsigned int m = gemm_info.m;
+ const unsigned int n = gemm_info.n;
+ const unsigned int k = gemm_info.k;
+
+ ARM_COMPUTE_UNUSED(m);
+ ARM_COMPUTE_UNUSED(n);
+ ARM_COMPUTE_UNUSED(k);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k);
+
+ // Validate the reinterpreted-as-3D-case
+ if (gemm_info.reinterpret_input_as_3d)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != m);
+ }
+
+ // Validate the gemm-batched case
+ if (src1->num_dimensions() > 2)
+ {
+ if (gemm_info.depth_output_gemm3d != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(3) != src1->dimension(2));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(2) != src1->dimension(2));
+ }
+ }
+
+ if (src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
+ {
+ const unsigned int src2_dim0 = src2->dimension(0);
+ const unsigned int src2_dim1 = src2->dimension(1);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1);
+ if (gemm_info.broadcast_bias)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n),
+ "Incorrect dimension of bias matrix which is to be broadcasted");
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix");
+ }
+ }
+
+ TensorShape tensor_shape1{src1->tensor_shape()};
+ tensor_shape1.set(0, n);
+ tensor_shape1.set(1, k);
+
+ const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
+ const TensorInfo tensor_info_reshaped1 =
+ src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
+
+ if (dst->total_size() != 0)
+ {
+ const TensorInfo tensor_info_dst =
+ dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(src0, src1, src2);
+ bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
+
+ // dst tensor auto initialization if not yet initialized
+ auto_init_if_empty(
+ *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+
+ TensorInfo tmp_info(*dst);
+
+ if (reinterpret_output_as_3d)
+ {
+ // Since the dst tensor has to be reinterpreted as 3D and the execution window is based on a 2D GEMM,
+ // the window needs to be constructed on the 2D collapsed version of the tensor
+ TensorShape tmp_shape(dst->tensor_shape());
+ tmp_shape.collapse(2U, 1U);
+ tmp_info.set_tensor_shape(tmp_shape);
+ }
+
+ Window win = calculate_max_window(tmp_info, Steps(1, 1));
+
+ // Collapse along the Z direction
+ // This collapse needs to be here in order to tune the Z dimension of LWS
+ const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
+ Window collapsed = win.collapse(win, dimension_to_collapse);
+
+ // Reconfigure the window size: a single arm_matrix_multiply operation needs 16 work-items to complete.
+ Window::Dimension x_dimension = collapsed.x();
+ Window::Dimension y_dimension = collapsed.y();
+
+ // Make M and N multiple of M0 and N0 respectively
+ const unsigned int ceil_to_multiple_n_n0 = ceil_to_multiple(x_dimension.end(), rhs_info.n0);
+ const unsigned int ceil_to_multiple_m_m0 = ceil_to_multiple(y_dimension.end(), lhs_info.m0);
+
+ // Divide M and N by M0 and N0 respectively
+ const unsigned int n_div_n0 = ceil_to_multiple_n_n0 / rhs_info.n0;
+ const unsigned int m_div_m0 = ceil_to_multiple_m_m0 / lhs_info.m0;
+
+ // Make n_div_n0 and m_div_m0 multiples of mmul_n0 and mmul_k0 respectively
+ const unsigned int ceil_to_multiple_n_div_n0_mmul_n0 = ceil_to_multiple(n_div_n0, mmul_n0);
+ const unsigned int ceil_to_multiple_m_div_m0_mmul_k0 = ceil_to_multiple(m_div_m0, mmul_k0);
+
+ // Ensure x_dimension is a multiple of the MMUL block size (mmul_n0 * mmul_k0)
+ x_dimension.set_end(ceil_to_multiple_n_div_n0_mmul_n0 * mmul_k0);
+ y_dimension.set_end(ceil_to_multiple_m_div_m0_mmul_k0 / mmul_k0);
+
+ collapsed.set(Window::DimX, x_dimension);
+ collapsed.set(Window::DimY, y_dimension);
+
+ return std::make_pair(Status{}, collapsed);
+}
+} // namespace
+
+ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel()
+{
+ _type = CLKernelType::GEMM;
+}
+
+void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src0,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+
+ // dst tensor auto initialization if not yet initialized
+ auto_init_if_empty(
+ *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
+
+ auto padding_info = get_padding_info({src0, src1, src2, dst});
+ _add_bias = src2 != nullptr;
+ _export_to_cl_image = rhs_info.export_to_cl_image;
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(src0, src1, src2, dst, lhs_info, rhs_info, gemm_info);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+ IClKernel::configure_internal(win_config.second);
+
+ _m = gemm_info.m;
+ _n = gemm_info.n;
+ _k = gemm_info.k;
+
+ const unsigned int m0_leftover = _m % lhs_info.m0;
+ const unsigned int n0_leftover = _n % rhs_info.n0;
+
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
+ build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)),
+ "-DALPHA=" + float_to_string_with_full_precision(alpha));
+ build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
+ build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
+ build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
+ build_opts.add_option_if(src0->data_type() == DataType::F16, "-DHALF_PRECISION");
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+ build_opts.add_option("-DM0_LEFTOVER=" + support::cpp11::to_string(m0_leftover));
+ build_opts.add_option("-DN0_LEFTOVER=" + support::cpp11::to_string(n0_leftover));
+ build_opts.add_option("-DMMUL_M0=" + support::cpp11::to_string(mmul_m0));
+ build_opts.add_option("-DMMUL_N0=" + support::cpp11::to_string(mmul_n0));
+ build_opts.add_option("-DMMUL_K0=" + support::cpp11::to_string(mmul_k0));
+ build_opts.add_option("-DACTIVATION_TYPE=" +
+ lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+ build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+ build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
+
+ build_opts.add_option_if(gemm_info.reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+ build_opts.add_option_if(gemm_info.depth_output_gemm3d != 0, "-DREINTERPRET_OUTPUT_AS_3D");
+ build_opts.add_option_if(src1->num_dimensions() > 2, "-DBATCHED_RHS");
+
+ std::string kernel_name("gemm_mm_reshaped_only_rhs_nt_mmul");
+ kernel_name += _export_to_cl_image ? "_texture" : "";
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += (_add_bias ? "add_bias_" : "");
+ _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : "");
+ _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : "");
+ _config_id += lower_string(string_from_data_type(src0->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_m);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_n);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_k);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.m0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.n0);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(),
+ src2 != nullptr ? src2->clone().get() : nullptr,
+ dst->clone().get(), lhs_info, rhs_info, gemm_info)
+ .first);
+
+ return Status{};
+}
+
+void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto src1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const auto src2 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr);
+
+ if (src1->info()->num_dimensions() < 3)
+ {
+ // The stride_z for matrix B must be zero if we do not slice
+ ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
+ }
+
+ cl::Image2D src1_image2d;
+
+ if (_export_to_cl_image)
+ {
+ const TensorShape shape2d(src1->info()->dimension(0) / 4,
+ src1->info()->dimension(1) * src1->info()->dimension(2));
+ const size_t image_row_pitch = src1->info()->strides_in_bytes()[1];
+
+ src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d,
+ src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+ }
+
+ Window slice = window.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+
+ add_3d_tensor_nhw_argument(idx, src0);
+ if (_export_to_cl_image)
+ {
+ _kernel.setArg(idx++, src1_image2d);
+ }
+ add_3d_tensor_nhw_argument(idx, src1);
+
+ // Bias buffer (_add_bias == true)
+ if (_add_bias)
+ {
+ add_3d_tensor_nhw_argument(idx, src2);
+ }
+ // dst buffer
+ add_3d_tensor_nhw_argument(idx, dst);
+
+ // Pass m, n and k at runtime as signed ints, to ensure that the results of any subtractions they appear in remain signed.
+ _kernel.setArg<cl_int>(idx++, _m);
+ _kernel.setArg<cl_int>(idx++, _n);
+ _kernel.setArg<cl_int>(idx++, _k);
+
+ // LWS_x should be at least a multiple of 16; (32, 2) has been chosen to place more work-items on a single core.
+ // The LWS also enforces the execution order of the work-items, which improves cache utilization.
+ enqueue(queue, *this, slice, cl::NDRange(32, 2), false);
+ } while (window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
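Not part of the patch: a small stand-alone example of the window-sizing arithmetic performed in validate_and_configure_window() above, using hypothetical GEMM and block sizes.

#include <cstdio>

// Stand-alone illustration of the MMUL window sizing; all values are hypothetical.
static unsigned int ceil_to_multiple(unsigned int value, unsigned int multiple)
{
    return ((value + multiple - 1) / multiple) * multiple;
}

int main()
{
    const unsigned int m = 100, n = 56;          // GEMM dimensions
    const unsigned int m0 = 4, n0 = 4;           // block sizes per work-item
    const unsigned int mmul_n0 = 4, mmul_k0 = 4; // MMUL block size constants

    // Number of N0 and M0 blocks, rounded up for the leftover edges
    const unsigned int n_div_n0 = ceil_to_multiple(n, n0) / n0; // 14
    const unsigned int m_div_m0 = ceil_to_multiple(m, m0) / m0; // 25

    // Round the block counts up to the MMUL tile and derive the window ends
    const unsigned int x_end = ceil_to_multiple(n_div_n0, mmul_n0) * mmul_k0; // 16 * 4 = 64
    const unsigned int y_end = ceil_to_multiple(m_div_m0, mmul_k0) / mmul_k0; // 28 / 4 = 7

    std::printf("execution window end: x=%u, y=%u\n", x_end, y_end);
    return 0;
}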
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h
new file mode 100644
index 0000000000..86d3012f6e
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H
+#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to multiply matrices using MMUL when only the input matrix RHS (src1) has been reshaped */
+class ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel : public IClKernel
+{
+public:
+ ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel);
+ /** Initialize the kernel's input and dst.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src0 Input tensor for the LHS matrix. Data type supported: F16/F32.
+ * @param[in] src1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0.
+ * @param[in] src2 Input tensor containing the bias matrix. Data type supported: same as @p src0.
+ * @param[out] dst Output tensor info. Data type supported: same as @p src0
+ * @param[in] alpha Weight of the matrix product
+ * @param[in] beta Weight of the matrix bias
+ * @param[in] lhs_info LHS matrix information used to retrieve the number of rows and accumulations to be processed by each thread. Only the following values are supported:
+ * lhs_info.m0 > 0
+ * lhs_info.k0: 1
+ * @param[in] rhs_info RHS matrix information used to retrieve the number of columns and accumulations to be processed by each thread. Only the following values are supported:
+ * rhs_info.n0: 1,2,3,4,8,16
+ * rhs_info.k0: same as lhs_info.k0
+ * rhs_info.transpose: false
+ * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src0,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ bool _add_bias{false};
+ bool _export_to_cl_image{false};
+ signed int _m{1};
+ signed int _n{1};
+ signed int _k{1};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H */
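Not part of the patch: a sketch of RHS block-size values that satisfy the MMUL-specific constraints enforced by validate_arguments() in the corresponding .cpp file; the helper name and any other parameters are hypothetical.

#include "arm_compute/core/Types.h"

// Illustrative helper: block sizes accepted by the MMUL variant (see validate_arguments()).
arm_compute::GEMMRHSMatrixInfo make_mmul_rhs_info(unsigned int n0)
{
    arm_compute::GEMMRHSMatrixInfo rhs_info{};
    rhs_info.n0         = n0;    // 1, 2, 3, 4, 8 or 16
    rhs_info.k0         = 1;     // only 1 is accepted
    rhs_info.h0         = 4;     // only 4 is accepted
    rhs_info.interleave = true;  // must be true
    rhs_info.transpose  = false; // must be false
    return rhs_info;
}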
diff --git a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp
new file mode 100644
index 0000000000..eea2a169a3
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ bool reinterpret_input_as_3d)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.v0 == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3),
+ "Only 2,3,4,8,16 are supported for k0");
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
+ ARM_COMPUTE_RETURN_ERROR_ON((lhs_info.m0 > 4 && lhs_info.m0 < 8) && lhs_info.transpose);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ dst->tensor_shape(),
+ misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+ }
+
+ return Status{};
+}
+
+Window
+configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
+{
+ const unsigned int num_elems_processed_per_iteration_x = lhs_info.k0;
+ const unsigned int num_elems_processed_per_iteration_y = lhs_info.m0;
+
+ TensorInfo tmp_info(*src);
+
+ if (reinterpret_input_as_3d)
+ {
+ // Since the src tensor has to be reinterpreted as 3D and the execution window is based on a 2D interleave,
+ // the window needs to be constructed on the 2D collapsed version of the tensor
+ TensorShape tmp_shape(src->tensor_shape());
+ tmp_shape.collapse(2U, 1U);
+ tmp_info.set_tensor_shape(tmp_shape);
+ }
+
+ // dst auto initialization if not yet initialized
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(
+ *src, lhs_info, reinterpret_input_as_3d)));
+
+ // Configure window
+ Window win =
+ calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ // Collapse along the Z direction
+ // This collapse needs to be here in order to tune the Z dimension of LWS
+ Window collapsed = win.collapse(win, Window::DimZ);
+
+ return collapsed;
+}
+} // namespace
+
+ClGemmReshapeLhsMatrixKernel::ClGemmReshapeLhsMatrixKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ bool reinterpret_input_as_3d)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d));
+
+ auto padding_info = get_padding_info({src});
+
+ const unsigned int src_w = src->dimension(0);
+ const unsigned int m = reinterpret_input_as_3d ? src->dimension(1) * src->dimension(2) : src->dimension(1);
+ const unsigned int partial_m0 = m % lhs_info.m0;
+ const unsigned int partial_k0 = src_w % lhs_info.k0;
+
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
+ build_opts.add_option_if(lhs_info.interleave, "-DINTERLEAVE");
+ build_opts.add_option_if_else(lhs_info.transpose, "-DRESHAPE_LHS_T", "-DRESHAPE_LHS_NT");
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()));
+ build_opts.add_option("-DPARTIAL_M0=" + support::cpp11::to_string(partial_m0));
+ build_opts.add_option("-DPARTIAL_K0=" + support::cpp11::to_string(partial_k0));
+
+ std::string kernel_name("gemm_reshape_lhs_matrix_");
+ kernel_name += lhs_info.transpose ? "t" : "nt";
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Configure kernel window
+ auto win_config = configure_window(src, dst, lhs_info, reinterpret_input_as_3d);
+ ICLKernel::configure_internal(win_config);
+
+ unsigned int idx = 2 * num_arguments_per_3d_tensor_nhw();
+ _kernel.setArg<cl_int>(idx++, m);
+ _kernel.setArg<cl_int>(idx++, lhs_info.v0);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "gemm_reshape_lhs_matrix_";
+ _config_id += (reinterpret_input_as_3d ? "3d_" : "");
+ _config_id += lower_string(string_from_data_type(src->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.m0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.k0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.v0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.interleave);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.transpose);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmReshapeLhsMatrixKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ bool reinterpret_input_as_3d)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d));
+ return Status{};
+}
+
+void ClGemmReshapeLhsMatrixKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ Window slice = window.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3d_tensor_nhw_argument(idx, src);
+ add_3d_tensor_nhw_argument(idx, dst);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h
new file mode 100644
index 0000000000..8e84e8ad8e
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H
+#define ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to reshape the LHS matrix when performing the matrix multiplication.
+ * In particular, this kernel splits the src matrix into blocks of size M0xK0 (defined through GEMMLHSMatrixInfo) and
+ * stores each one in the dst matrix, unrolling its values
+ */
+class ClGemmReshapeLhsMatrixKernel : public ICLKernel
+{
+public:
+ ClGemmReshapeLhsMatrixKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmReshapeLhsMatrixKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Input tensor. Data types supported: All
+ * @param[out] dst Output tensor. Data type supported: same as @p src
+ * @param[in] lhs_info LHS matrix information to be used for reshaping. This object contains all the necessary
+ * information to reshape the src tensor. Only the following values are supported:
+ * lhs_info.m0: 2,3,4,5,6,7,8
+ * lhs_info.k0: 2,3,4,8,16
+ * lhs_info.v0: greater than 0
+ * lhs_info.transpose: true, false
+ * lhs_info.interleave: true, false
+ * @param[in] reinterpret_src_as_3d (Optional) True if the src has to be reinterpreted as a 3D tensor
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ bool reinterpret_src_as_3d = false);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClGemmReshapeLhsMatrixKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ bool reinterpret_src_as_3d);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H */
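Not part of the patch: a minimal sketch of calling the static validate() above with an uninitialised destination, which is accepted because the dst shape checks are skipped when dst->total_size() == 0. Shapes, block sizes and the function name are hypothetical.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

#include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h"

arm_compute::Status try_validate_lhs_reshape()
{
    using namespace arm_compute;

    GEMMLHSMatrixInfo lhs_info{};
    lhs_info.m0         = 4;
    lhs_info.k0         = 4;
    lhs_info.v0         = 2;
    lhs_info.interleave = true;
    lhs_info.transpose  = false;

    const TensorInfo src(TensorShape(32U, 64U), 1, DataType::F32); // [K, M]
    TensorInfo       dst;                                          // auto-initialised at configure time

    return opencl::kernels::ClGemmReshapeLhsMatrixKernel::validate(&src, &dst, lhs_info, false);
}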
diff --git a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp
new file mode 100644
index 0000000000..b9ce3873c7
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.h0 == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+ "Only 2,3,4,8,16 are supported for n0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && (rhs_info.k0 != 1) && (rhs_info.k0 != 3)),
+ "Only 1,2,3,4,8,16 are supported for k0");
+ ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16);
+ ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16);
+ ARM_COMPUTE_RETURN_ERROR_ON((rhs_info.k0 == 1) && (rhs_info.transpose));
+
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+
+ if (rhs_info.export_to_cl_image)
+ {
+ const TensorInfo tensor_reshaped_info(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info), 1,
+ src->data_type());
+ ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info));
+ }
+
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ dst->tensor_shape(), misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info)
+{
+ const unsigned int num_elems_processed_per_iteration_x = rhs_info.n0;
+ const unsigned int num_elems_processed_per_iteration_y = rhs_info.k0;
+ bool window_changed = false;
+
+ // dst auto initialization if not yet initialized
+ auto_init_if_empty(
+ *dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info)));
+
+ // Configure window
+ Window win =
+ calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowRectangle src_access(src, 0, 0, num_elems_processed_per_iteration_x,
+ num_elems_processed_per_iteration_y);
+
+ window_changed = update_window_and_padding(win, src_access);
+
+ if (rhs_info.export_to_cl_image)
+ {
+ gemm::update_padding_for_cl_image(dst);
+ }
+
+ // Collapse along the Z direction
+ // This collapse needs to be here in order to tune the Z dimension of LWS
+ Window collapsed = win.collapse(win, Window::DimZ);
+
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, collapsed);
+}
+} // namespace
+
+ClGemmReshapeRhsMatrixKernel::ClGemmReshapeRhsMatrixKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const GEMMRHSMatrixInfo &rhs_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, rhs_info));
+
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+ build_opts.add_option_if(rhs_info.interleave, "-DINTERLEAVE");
+ build_opts.add_option_if(rhs_info.transpose, "-DRESHAPE_RHS_T");
+ build_opts.add_option_if(!rhs_info.transpose, "-DRESHAPE_RHS_NT");
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()));
+
+ std::string kernel_name("gemm_reshape_rhs_matrix_");
+ kernel_name += rhs_info.transpose ? "t" : "nt";
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(src, dst, rhs_info);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ unsigned int idx = 2 * num_arguments_per_3d_tensor_nhw();
+ _kernel.setArg<cl_int>(idx++, rhs_info.h0);
+}
+
+Status ClGemmReshapeRhsMatrixKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const GEMMRHSMatrixInfo &rhs_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, rhs_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), rhs_info).first);
+
+ return Status{};
+}
+
+void ClGemmReshapeRhsMatrixKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ Window slice = window.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3d_tensor_nhw_argument(idx, src);
+ add_3d_tensor_nhw_argument(idx, dst);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
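Not part of the patch: the n0/k0 checks in validate_arguments() above rely on the classic power-of-two bit trick, extended to also admit 3. The snippet below illustrates the resulting accepted set; it paraphrases the documented constraint rather than copying the kernel's exact checks.

#include <cassert>

// (v & (v - 1)) == 0 holds exactly for powers of two; "|| v == 3" admits 3 as well.
static bool block_size_supported(unsigned int v)
{
    const bool pow2_or_3 = ((v & (v - 1)) == 0) || (v == 3);
    return pow2_or_3 && v >= 2 && v <= 16; // documented set for n0: {2, 3, 4, 8, 16}
}

int main()
{
    assert(block_size_supported(2) && block_size_supported(3) && block_size_supported(4));
    assert(block_size_supported(8) && block_size_supported(16));
    assert(!block_size_supported(5) && !block_size_supported(6) && !block_size_supported(32));
    return 0;
}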
diff --git a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h
new file mode 100644
index 0000000000..7203d574fb
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H
+#define ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to reshape the RHS matrix when performing the matrix multiplication
+ * In particular, this kernel splits the src matrix into blocks of size K0xN0 and stores each one in
+ * the dst matrix, unrolling its values */
+class ClGemmReshapeRhsMatrixKernel : public ICLKernel
+{
+public:
+ ClGemmReshapeRhsMatrixKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmReshapeRhsMatrixKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel guarantees the OpenCL pitch alignment for the output tensor,
+ *       which is required to create an OpenCL image object from a buffer in @ref ClGemmMatrixMultiplyReshapedKernel and in @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel.
+ *       Since the OpenCL image object is created by importing the OpenCL buffer, the following conditions must be met:
+ *       -# rhs_info.n0 can only be 4, 8 or 16
+ *       -# rhs_info.k0 can only be 4, 8 or 16
+ *       -# The data type can only be F32 or F16
+ *       -# The platform must support the OpenCL cl_khr_image2d_from_buffer extension
+ *       -# The output width must be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+ *       -# The output (height * depth) must be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+ *       -# The output tensor must only be consumed by @ref ClGemmMatrixMultiplyReshapedKernel or @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Input tensor. Data types supported: All
+ * @param[out] dst Output tensor. Data type supported: same as @p src
+ * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary
+ * information to reshape the src tensor. Only the following values are supported:
+ * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
+ * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false), (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
+ * rhs_info.h0: greater than 0
+ * rhs_info.transpose: true, false
+ * rhs_info.interleave: true, false
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const GEMMRHSMatrixInfo &rhs_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClGemmReshapeRhsMatrixKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp b/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp
new file mode 100644
index 0000000000..2e1cefc6e7
--- /dev/null
+++ b/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClHeightConcatenateKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY));
+
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != dst->dimension(0));
+ for (size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i));
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() > 4);
+
+ return Status{};
+}
+} // namespace
+
+ClHeightConcatenateKernel::ClHeightConcatenateKernel() : _height_offset(0)
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+Status ClHeightConcatenateKernel::validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, height_offset, dst));
+ return Status{};
+}
+
+void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ unsigned int height_offset,
+ ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, height_offset, dst));
+
+ auto padding_info = get_padding_info({src, dst});
+
+ _height_offset = height_offset;
+
+ // Add build options
+ const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0));
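+    // Note (illustrative): adjust_vec_size() reduces the requested vector size of 4 when the source width is
+    // smaller (e.g. a width of 3 gives a vector size of 3); the remainder is handled via VEC_SIZE_LEFTOVER below.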
+
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+
+ if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
+ {
+ const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
+
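+        // With mismatching quantization info the OpenCL kernel is expected to requantize on the fly,
+        // roughly as: dst = (src - OFFSET_IN1) * SCALE_IN1 / SCALE_OUT + OFFSET_OUT.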
+ build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
+ build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
+ build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale));
+ build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
+ }
+ _depth = src->dimension(2);
+
+ std::string kernel_name = "concatenate_height";
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+ // Configure kernel window
+
+ // The window needs to be based on src as we copy all the heights of src
+ Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
+ ICLKernel::configure_internal(win.collapse(win, Window::DimZ));
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+void ClHeightConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, src, window);
+ add_4D_tensor_argument(idx, dst, window);
+ _kernel.setArg<cl_uint>(idx++, _depth);
+ enqueue(queue, *this, window, lws_hint());
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClHeightConcatenateKernel.h b/src/gpu/cl/kernels/ClHeightConcatenateKernel.h
new file mode 100644
index 0000000000..5a391a1212
--- /dev/null
+++ b/src/gpu/cl/kernels/ClHeightConcatenateKernel.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_HEIGHT_CONCATENATE_KERNEL_H
+#define ARM_COMPUTE_CL_HEIGHT_CONCATENATE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the height concatenate kernel.
+ * The source tensor will be concatenated into the destination tensor.
+ */
+class ClHeightConcatenateKernel : public IClKernel
+{
+public:
+ ClHeightConcatenateKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClHeightConcatenateKernel);
+ /** Initialise the kernel's source and destination
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: All.
+ * @param[in] height_offset The starting offset on the Y axis for the dst tensor.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ *
+ */
+ void
+ configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst);
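+    // Illustrative usage sketch (hypothetical tensor infos; two sources stacked along Y into a pre-sized dst):
+    //   ClHeightConcatenateKernel concat0, concat1;
+    //   concat0.configure(compile_context, &src0_info, 0, &dst_info);
+    //   concat1.configure(compile_context, &src1_info, src0_info.dimension(Window::DimY), &dst_info);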
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClHeightConcatenateKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+
+private:
+ unsigned int _height_offset;
+ int32_t _depth{0};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_HEIGHT_CONCATENATE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClIm2ColKernel.cpp b/src/gpu/cl/kernels/ClIm2ColKernel.cpp
new file mode 100644
index 0000000000..ef7a52828f
--- /dev/null
+++ b/src/gpu/cl/kernels/ClIm2ColKernel.cpp
@@ -0,0 +1,488 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClIm2ColKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+#include <cmath>
+#include <tuple>
+#include <utility>
+
+namespace arm_compute
+{
+using namespace misc::shape_calculator;
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+struct Im2ColConfiguration
+{
+ std::string kernel_name{};
+ std::set<std::string> build_options{};
+ unsigned int num_elems_processed_per_iteration{};
+ bool is_padding_required_nchw{};
+};
+
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation,
+ unsigned int num_groups)
+{
+ const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && has_bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
+ ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON(num_groups == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::NHWC && num_groups > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON((src->dimension(channel_idx) % num_groups) != 0);
+
+    // Since there's no implicit padding added, check that the total input spatial dimensions (including the conv paddings) are big enough for the kernel dimensions
+ const unsigned int width_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
+ const unsigned total_width = src->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right();
+ const unsigned total_height = src->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom();
+ ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height));
+
+ if (dst->total_size() > 0)
+ {
+ const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(
+ compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src,
+ ITensorInfo *dst,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation,
+ unsigned int num_elems_processed_per_iteration,
+ bool is_padding_required_nchw,
+ unsigned int num_groups)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ // Output tensor auto initialization if not yet initialized
+ TensorShape expected_output_shape =
+ compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups);
+
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(expected_output_shape));
+
+ const DataLayout data_layout = src->data_layout();
+ const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const unsigned int input_width = src->dimension(width_idx);
+ const unsigned int input_height = src->dimension(height_idx);
+
+ // Configure the execute window based on the selected optimal OpenCL kernel
+ bool window_changed = false;
+ Window win;
+
+ if (data_layout == DataLayout::NHWC)
+ {
+ win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
+ }
+ else
+ {
+ if (is_padding_required_nchw)
+ {
+ const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(),
+ conv_info.pad_left());
+ win = calculate_max_window(
+ *src, Steps(num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second));
+ AccessWindowStatic input_access(
+ src, -border.left, -border.top,
+ ceil_to_multiple(input_width + border.right, kernel_dims.width * num_elems_processed_per_iteration),
+ input_height + border.bottom);
+ window_changed = window_changed || update_window_and_padding(win, input_access);
+ }
+ else
+ {
+            // For the generic case, ClIm2ColKernel doesn't need padding (we do not read out-of-bounds elements) so
+ // update_window_and_padding() can be skipped
+ win = calculate_max_window(*src, Steps());
+ }
+ }
+
+    // Set the Z dimension's step to the size of the whole dimension so that the window cannot be split across Z
+ win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start());
+
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+
+Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation,
+ unsigned int num_groups)
+{
+ const DataLayout data_layout = src->data_layout();
+ const DataType data_type = src->data_type();
+ const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const unsigned int input_width = src->dimension(width_idx);
+ const unsigned int input_height = src->dimension(height_idx);
+ const unsigned int input_channel = src->dimension(channel_idx);
+
+ const std::pair<unsigned int, unsigned int> convolved_dims =
+ scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
+
+ // Im2Col configuration
+ std::string kernel_name = "im2col_generic_";
+ CLBuildOptions build_opts;
+ unsigned int num_elems_processed_per_iteration = 1;
+ bool is_padding_required_nchw = false;
+ const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
+
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src->element_size()));
+ build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
+ build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
+ build_opts.add_option("-DCONVOLVED_WIDTH=" + support::cpp11::to_string(convolved_dims.first));
+ build_opts.add_option("-DCONVOLVED_HEIGHT=" + support::cpp11::to_string(convolved_dims.second));
+ build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
+ build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
+ build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
+ build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
+ build_opts.add_option("-DPAD_RIGHT=" + support::cpp11::to_string(conv_info.pad_right()));
+ build_opts.add_option("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom()));
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height));
+ build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_channel));
+ build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
+ build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
+ build_opts.add_option_if(num_groups > 1, "-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
+ build_opts.add_option_if_else(is_data_type_quantized(data_type),
+ "-DPAD_VALUE=" + support::cpp11::to_string(qinfo.offset), "-DPAD_VALUE=0");
+ build_opts.add_option_if(has_bias, "-DHAS_BIAS");
+
+ if (data_layout == DataLayout::NHWC)
+ {
+ num_elems_processed_per_iteration = std::min(2U, input_channel);
+ is_padding_required_nchw = false;
+
+ // Only the 3x3 and 9x9 cases are optimized for NHWC
+ if (kernel_dims == Size2D(3U, 3U))
+ {
+ kernel_name = "im2col3x3_";
+ build_opts.add_option("-DIM2COL_3X3");
+ }
+ else if (kernel_dims == Size2D(9U, 9U))
+ {
+ kernel_name = "im2col9x9_";
+ build_opts.add_option("-DIM2COL_9X9");
+ }
+ else
+ {
+ build_opts.add_option("-DIM2COL_GENERIC");
+ }
+
+ // Get boundary vector (the first/last vector with potentially a partial vector size) size
+        // If input_channel is a multiple of num_elems_processed_per_iteration, the boundary vec size is the (full) vector size;
+ // otherwise, the boundary vec size is the (partial) remainder vector size
+ const unsigned int vec_size = num_elems_processed_per_iteration;
+ const unsigned int partial_vec_size = input_channel % vec_size;
+ const unsigned int boundary_vec_size = vec_size - ((vec_size - partial_vec_size) % vec_size);
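+        // Worked example (illustrative): with vec_size = 2, input_channel = 7 gives partial_vec_size = 1 and
+        // boundary_vec_size = 1, while input_channel = 8 gives partial_vec_size = 0 and boundary_vec_size = 2 (a full vector).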
+ build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vec_size));
+ build_opts.add_option("-DBOUNDARY_VECTOR_SIZE=" + support::cpp11::to_string(boundary_vec_size));
+ }
+ else
+ {
+ if (dilation == Size2D(1U, 1U))
+ {
+ const bool squared_im2col = kernel_dims.width == kernel_dims.height;
+ if (squared_im2col)
+ {
+ // Check if we can run an optimized im2col for NCHW
+ switch (kernel_dims.width)
+ {
+ case 1:
+ // Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false
+ if (conv_info.stride().first == 1 && !conv_info.has_padding())
+ {
+ kernel_name = "im2col1x1_stridex1_";
+ num_elems_processed_per_iteration = 4;
+ is_padding_required_nchw = true;
+ }
+ break;
+ case 3:
+ kernel_name = "im2col3x3_";
+ num_elems_processed_per_iteration = 1;
+ is_padding_required_nchw = true;
+ break;
+ case 5:
+ kernel_name = "im2col5x5_";
+ num_elems_processed_per_iteration = 1;
+ is_padding_required_nchw = true;
+ break;
+ case 11:
+ // Optimized im2col11x11 if pad_x = pad_y = 0
+ if (!conv_info.has_padding())
+ {
+ kernel_name = "im2col11x11_padx0_pady0_";
+ num_elems_processed_per_iteration = 1;
+ is_padding_required_nchw = true;
+ }
+ break;
+ default:
+ kernel_name = "im2col_generic_";
+ num_elems_processed_per_iteration = 1;
+ is_padding_required_nchw = false;
+ break;
+ }
+ }
+ else if (kernel_dims.width > 1 && !conv_info.has_padding())
+ {
+ kernel_name = "im2col_generic_padx0_pady0_";
+ num_elems_processed_per_iteration = 1;
+ is_padding_required_nchw = false;
+
+ // Optimized im2col is performed using one or more vector operations with the specified vector size
+ // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4
+ // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3.
+ // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3.
+ // Using the vector size of 8, however, may be faster.
+ // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0
+ // is used instead.)
+ const size_t vector_size = std::min(static_cast<size_t>(4), kernel_dims.width);
+ const size_t width_mod_vector_size = kernel_dims.width % vector_size;
+ build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
+ build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size));
+ }
+ }
+ }
+
+ // Append the data layout to the kernel_name
+ kernel_name += lower_string(string_from_data_layout(data_layout));
+
+ Im2ColConfiguration im2col_config;
+ im2col_config.kernel_name = kernel_name;
+ im2col_config.build_options = build_opts.options();
+ im2col_config.num_elems_processed_per_iteration = num_elems_processed_per_iteration;
+ im2col_config.is_padding_required_nchw = is_padding_required_nchw;
+
+ return im2col_config;
+}
+} // namespace
+
+ClIm2ColKernel::ClIm2ColKernel()
+ : _data_layout(DataLayout::UNKNOWN),
+ _convolved_dims(),
+ _num_elems_processed_per_iteration(1),
+ _kernel_dims(),
+ _conv_info(),
+ _num_groups()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClIm2ColKernel::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation,
+ unsigned int num_groups)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups));
+
+ auto padding_info = get_padding_info({src, dst});
+ _data_layout = src->data_layout();
+
+ const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const unsigned int input_width = src->dimension(width_idx);
+ const unsigned int input_height = src->dimension(height_idx);
+
+ // Select and configure the optimal OpenCL kernel to run.
+ // This function returns the OpenCL kernel's name, the arguments to pass at compile time, the number of elements processed per iteration
+ // and the padding requirement flag
+ Im2ColConfiguration im2col_config =
+ configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups);
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, im2col_config.kernel_name, im2col_config.build_options);
+
+ _convolved_dims =
+ scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
+ _num_elems_processed_per_iteration = im2col_config.num_elems_processed_per_iteration;
+ _kernel_dims = kernel_dims; // Only needed by the Tuner
+ _conv_info = conv_info; // Only needed by the Tuner
+ _num_groups = num_groups;
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(src, dst, kernel_dims, conv_info, has_bias, dilation,
+ im2col_config.num_elems_processed_per_iteration,
+ im2col_config.is_padding_required_nchw, num_groups);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ IClKernel::configure_internal(win_config.second);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = im2col_config.kernel_name;
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(src->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(num_groups);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(1));
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_layout(_data_layout));
+
+ ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info));
+}
+
+Status ClIm2ColKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation,
+ unsigned int num_groups)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups));
+ Im2ColConfiguration im2col_config =
+ configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), kernel_dims,
+ conv_info, has_bias, dilation,
+ im2col_config.num_elems_processed_per_iteration,
+ im2col_config.is_padding_required_nchw, num_groups)
+ .first);
+ return Status{};
+}
+
+void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IClKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(tensors.empty());
+
+ // Get initial windows
+ // Collapse in order to have (SRC_DEPTH * BATCH_SIZE) on the 3rd dimension
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ window_collapsed.set_dimension_step(Window::DimZ, 1);
+
+ auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ Window window_output;
+ window_output.use_tensor_dimensions(dst->info()->tensor_shape());
+
+ const Window first_slice_3d = window_collapsed.first_slice_window_3D();
+
+ Window slice = first_slice_3d;
+ Window slice_in = first_slice_3d;
+ Window slice_out = window_output.first_slice_window_2D();
+
+ if (_data_layout == DataLayout::NHWC)
+ {
+ const Window tmp_win = window.collapse_if_possible(ICLKernel::window(), 3);
+ const int num_batches = tmp_win[3].end();
+
+ slice.set(1, Window::Dimension(0, static_cast<int>(dst->info()->tensor_shape()[1]), 1));
+ slice.set(2, Window::Dimension(0, static_cast<int>(num_batches), 1));
+ }
+ else
+ {
+ slice.set(0,
+ Window::Dimension(
+ 0, static_cast<int>(ceil_to_multiple(_convolved_dims.first, _num_elems_processed_per_iteration)),
+ _num_elems_processed_per_iteration));
+ slice.set(1, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1));
+ // Note: In case of NCHW the 3rd dimension is already set collapsing the input window
+ }
+
+ // Setup input slice
+ // The dimensions of the input are increased within the OpenCL kernel
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ // Setup output slice
+ // The dimensions of the output are increased within the OpenCL kernel
+ slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
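+    // The per-slice tensor arguments added inside the loop below occupy the first kernel argument slots,
+    // so the two batch-stride scalars are set once here at the indices that follow them.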
+ unsigned int idx = num_arguments_per_3D_tensor() +
+ (_num_groups == 1 ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor());
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src->info()->strides_in_bytes()[3]));
+ _kernel.setArg<cl_uint>(idx++,
+ static_cast<unsigned int>(dst->info()->strides_in_bytes()[((_num_groups == 1) ? 2 : 3)]));
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src, slice_in);
+ if (_num_groups == 1)
+ {
+ add_2D_tensor_argument(idx, dst, slice_out);
+ }
+ else
+ {
+ add_3D_tensor_argument(idx, dst, slice_out);
+ }
+ enqueue(queue, *this, slice, lws_hint());
+ } while (window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) &&
+ window_collapsed.slide_window_slice_3D(slice_in));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClIm2ColKernel.h b/src/gpu/cl/kernels/ClIm2ColKernel.h
new file mode 100644
index 0000000000..c8cd5b328d
--- /dev/null
+++ b/src/gpu/cl/kernels/ClIm2ColKernel.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_IM2COL_KERNEL_H
+#define ARM_COMPUTE_CL_IM2COL_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Size2D.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the im2col reshape kernel.
+ *
+ * Rearranges image blocks into columns: each convolution block is stripped out into a single column,
+ * so that the convolution can be computed as a plain matrix multiplication.
+ *
+ * For example, taking the image below and assuming 3x3 image blocks with a stride of 1, we have:
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * =
+ * \left( \begin{array}{ccccccccc}
+ * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\
+ * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\
+ * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\
+ * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * @f]
+ */
+class ClIm2ColKernel : public IClKernel
+{
+public:
+ /** Default constructor */
+ ClIm2ColKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClIm2ColKernel);
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src The input tensor info to convert. 3 lower dimensions represent a single input [width, height, IFM],
+     *                             while every optional dimension from 4 and above represents a batch of inputs. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+ * @param[out] dst The output tensor info. First 2 lower dimensions represent a transform of each 3D input,
+     *                             while every dimension above represents a batch. Data types supported: Same as @p src
+ * @param[in] kernel_dims The kernel dimensions (width and height).
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  has_bias        In case biases are provided, expands the im2col output with an extra element set to 1, so the bias folds into the matrix multiplication.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation = Size2D(1U, 1U),
+ unsigned int num_groups = 1);
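+    // Illustrative usage sketch (hypothetical 3x3 convolution, stride 1, padding 1, no bias, default groups):
+    //   ClIm2ColKernel im2col;
+    //   im2col.configure(compile_context, &src_info, &dst_info, Size2D(3U, 3U),
+    //                    PadStrideInfo(1, 1, 1, 1), /* has_bias */ false);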
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClIm2ColKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation = Size2D(1U, 1U),
+ unsigned int num_groups = 1);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+public:
+ DataLayout _data_layout;
+ std::pair<unsigned int, unsigned int> _convolved_dims;
+ unsigned int _num_elems_processed_per_iteration;
+ Size2D _kernel_dims;
+ PadStrideInfo _conv_info;
+ unsigned int _num_groups;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_IM2COL_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp
new file mode 100644
index 0000000000..8c493d08c6
--- /dev/null
+++ b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const DirectConvComputeKernelInfo &desc)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != src->dimension(0),
+ "Weights feature map dimension should match the respective src's one");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8,
+ "M0 can only be greater than 0 and less than or equal to 8");
+
+ // Checks performed when dst is configured
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ dst->tensor_shape(),
+ misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), src->data_layout(),
+ weights->tensor_shape(), conv_info, desc));
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+ }
+
+ return Status{};
+}
+} // namespace
+
+ClIndirectConv2dAddressPrecalculationKernel::ClIndirectConv2dAddressPrecalculationKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClIndirectConv2dAddressPrecalculationKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const DirectConvComputeKernelInfo &desc)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, dst, conv_info, desc));
+
+ constexpr unsigned int width_idx = 1;
+ constexpr unsigned int height_idx = 2;
+
+ // Get dst shape
+ TensorShape output_shape = misc::shape_calculator::compute_indirect_buffer_shape(
+ src->tensor_shape(), src->data_layout(), weights->tensor_shape(), conv_info, desc);
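+    // For example (illustrative): with M0 = 4, a 3x3 kernel and a 7x7 convolution output (M = 49),
+    // the indirect buffer shape is [M0 * Kw * Kh, ceil(M / M0), batch] = [36, 13, batch].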
+
+ TensorShape output_conv_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
+
+    // Output auto initialization if not yet initialized
+ auto_init_if_empty(*dst, output_shape, 1, DataType::S32);
+
+ // Configure kernel window
+ Window win;
+
+ // Create window and update padding
+ win = calculate_max_window(output_shape, Steps(1));
+
+ ICLKernel::configure_internal(win);
+
+ std::stringstream kernel_name;
+ CLBuildOptions build_options;
+
+ kernel_name << "indirect_convolution_address_precalculation";
+
+ const unsigned int pad_left = conv_info.pad_left();
+ const unsigned int pad_top = conv_info.pad_top();
+ const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+ const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
+ const auto dst_data_type = dst->data_type();
+
+ build_options.add_option("-DSRC_CONV_WIDTH=" + support::cpp11::to_string(src->dimension(width_idx)));
+ build_options.add_option("-DSRC_CONV_HEIGHT=" + support::cpp11::to_string(src->dimension(height_idx)));
+ build_options.add_option("-DDST_CONV_WIDTH=" + support::cpp11::to_string(output_conv_shape[width_idx]));
+ build_options.add_option("-DDST_CONV_HEIGHT=" + support::cpp11::to_string(output_conv_shape[height_idx]));
+ build_options.add_option("-DDST_TENSOR_TYPE=BUFFER");
+ build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst_data_type));
+ build_options.add_option("-DWEI_CONV_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx)));
+ build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x));
+ build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_stride_y));
+ build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(pad_left));
+ build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(pad_top));
+ build_options.add_option("-DM0=" + support::cpp11::to_string(desc.m0));
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_options.add_option("-D" + upper_string(kernel_name.str()));
+
+ _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options());
+
+ // Since this kernel should be called only once, we do not need to set the config_id for tuning
+}
+
+Status ClIndirectConv2dAddressPrecalculationKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const DirectConvComputeKernelInfo &desc)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info, desc));
+ return Status{};
+}
+
+void ClIndirectConv2dAddressPrecalculationKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ // Get initial windows
+ const Window slice = window.first_slice_window_3D();
+
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ unsigned int idx = 0;
+ add_4d_tensor_nhwc_argument(idx, dst);
+ enqueue(queue, *this, slice);
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h
new file mode 100644
index 0000000000..b565609c6a
--- /dev/null
+++ b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_INDIRECT_CONV2D_ADDRESS_PRECALCULATION_KERNEL_H
+#define ARM_COMPUTE_CL_INDIRECT_CONV2D_ADDRESS_PRECALCULATION_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+struct DirectConvComputeKernelInfo;
+
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the direct convolution kernel. */
+class ClIndirectConv2dAddressPrecalculationKernel : public IClKernel
+{
+public:
+ ClIndirectConv2dAddressPrecalculationKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClIndirectConv2dAddressPrecalculationKernel);
+    /** Set the src, weights and dst tensors info.
+     *
+     * @note: When M0 is 5, 6 or 7, the kernel rounds M0 up to the nearest power of two, i.e. 8. This implementation
+     * detail allows the kernel to exploit native OpenCL stores.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src The src tensor info to convolve. 3 lower dimensions represent a single src [IFM, width, height],
+     *                            while every optional dimension from 4 and above represents a batch of inputs. Data types supported: F16/F32.
+ * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [IFM, kernel_x, kernel_y, OFM].
+ * The 1st dimension must be the same as the src's volume 1st dimension.
+     *                            Data type supported: Same as @p src.
+ * @param[out] dst Output tensor info where to store the precalculated offsets. Data types supported: S32.
+ * The output is a 3D tensor with the following dimensions: [M0 x Kw x Kh, ceil(M/M0), batch-size], where:
+ * Kw=Kernel width, Kh=Kernel height, M0=number of rows processed by each workitem, and M=dst_width x dst_height.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] desc Direct convolution descriptor used to build the NHWC direct/indirect convolution kernel.
+ */
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const DirectConvComputeKernelInfo &desc);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+     * Similar to ClIndirectConv2dAddressPrecalculationKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const DirectConvComputeKernelInfo &desc);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_INDIRECT_CONV2D_ADDRESS_PRECALCULATION_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp
new file mode 100644
index 0000000000..3510b6970c
--- /dev/null
+++ b/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClIndirectConv2dKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLUtils.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *indirect_buffer,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ const DirectConvComputeKernelInfo &desc)
+{
+ ARM_COMPUTE_UNUSED(act_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indirect_buffer, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ indirect_buffer->tensor_shape(),
+ misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), src->data_layout(),
+ weights->tensor_shape(), conv_info, desc));
+
+ constexpr int channel_idx = 0;
+ constexpr int batch_idx = 3;
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx),
+ "Weights feature map dimension should match the respective src's one");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8,
+ "M0 can only be greater than 0 and less than or equal to 8");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 &&
+ desc.n0 != 16,
+ "N0 can only be: 1, 2, 3, 4, 8, and 16");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 &&
+ desc.k0 != 16,
+ "K0 can only be: 1, 2, 3, 4, 8, and 16");
+
+ if (desc.export_weights_to_cl_image)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, "K0 can only be: 4, 8, and 16");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(weights),
+ "Export to CLImage is not supported for this weight configuration");
+ }
+
+ if (biases != nullptr)
+ {
+ if (is_data_type_quantized_asymmetric(src->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(channel_idx) != weights->dimension(batch_idx),
+ "Biases size and number of dst feature maps should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, "Biases should be one dimensional");
+ }
+
+ // Checks performed when dst is configured
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ dst->tensor_shape(), misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ }
+
+ return Status{};
+}
+} // namespace
+
+ClIndirectConv2dKernel::ClIndirectConv2dKernel()
+{
+ _type = CLKernelType::DIRECT;
+}
+
+void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *indirect_buffer,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ const DirectConvComputeKernelInfo &desc)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, indirect_buffer, dst);
+
+ // Perform validation
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc));
+
+ constexpr unsigned int channel_idx = 0;
+ constexpr unsigned int width_idx = 1;
+ constexpr unsigned int height_idx = 2;
+ const unsigned int kernel_width = weights->dimension(width_idx);
+ const unsigned int kernel_height = weights->dimension(height_idx);
+ const DataType data_type = src->data_type();
+
+ const GPUTarget gpu_target = get_target();
+
+ // Get dst shape
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
+
+    // Output auto initialization if not yet initialized
+ auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info());
+
+ // Configure kernel window
+ Window win;
+ output_shape.collapse(2U, 1U);
+ const unsigned int n0 = adjust_vec_size(desc.n0, output_shape[0]);
+ const unsigned int m0 = adjust_vec_size(desc.m0, output_shape[1]);
+ const unsigned int k0 = adjust_vec_size(desc.k0, src->dimension(channel_idx));
+
+ const unsigned int partial_store_n0 = dst->dimension(channel_idx) % n0;
+
+ // Create window and update padding
+ win = calculate_max_window(output_shape, Steps(n0, m0));
+
+ ICLKernel::configure_internal(win);
+
+ std::stringstream kernel_name;
+ CLBuildOptions build_options;
+
+ kernel_name << "indirect_convolution_nhwc";
+
+ _export_to_cl_image = desc.export_weights_to_cl_image;
+
+ // Update the padding for the weights tensor if we can export to cl_image
+ if (_export_to_cl_image)
+ {
+ gemm::update_padding_for_cl_image(weights);
+ }
+
+ // Add padding to indirect buffer to avoid out-of-bound reads
+ // When M0 is 5, 6, and 7, we use vload8 to fetch the data from the buffer
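+    // For example (illustrative): m0 = 6 uses vload8, so an indirect buffer width of 50 is rounded up to 56
+    // (6 elements of right padding).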
+ const unsigned int load_indirect_buf_size = m0 > 4 ? 8 : m0;
+ const unsigned int indirect_buf_width = indirect_buffer->tensor_shape()[0];
+ const unsigned int round_up_width =
+ ((indirect_buf_width + load_indirect_buf_size - 1) / load_indirect_buf_size) * load_indirect_buf_size;
+ const unsigned int padding = round_up_width - indirect_buf_width;
+ indirect_buffer->extend_padding(PaddingSize(0, indirect_buffer->padding().right + padding, 0, 0));
+
+ if (biases != nullptr)
+ {
+ build_options.add_option(std::string("-DHAS_BIAS"));
+ build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type())));
+ }
+
+ // Conditions of -cl-fast-relaxed-math causing accuracy issues can be traced from COMPMID-5324
+ const auto act_function = act_info.activation();
+
+ if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+ (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU ||
+ act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) &&
+ (data_type == DataType::F32 || data_type == DataType::F16))
+ {
+ // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
+ // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
+ build_options.add_option("-cl-unsafe-math-optimizations");
+ }
+ else
+ {
+ build_options.add_option("-cl-fast-relaxed-math");
+ }
+
+ build_options.add_option("-DSRC_TENSOR_TYPE=BUFFER");
+ build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(channel_idx)));
+ build_options.add_option("-DOFF_TENSOR_TYPE=BUFFER");
+ build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(width_idx)));
+ build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(height_idx)));
+ build_options.add_option("-DDST_TENSOR_TYPE=BUFFER");
+ build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_options.add_option_if_else(_export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER");
+ build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(kernel_width));
+ build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(kernel_height));
+ build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_options.add_option("-DN0=" + support::cpp11::to_string(n0));
+ build_options.add_option("-DM0=" + support::cpp11::to_string(m0));
+ build_options.add_option("-DK0=" + support::cpp11::to_string(k0));
+ build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
+ build_options.add_option("-DIND_BUFF_VEC_SIZE=" + support::cpp11::to_string(load_indirect_buf_size));
+ build_options.add_option_if((src->dimension(channel_idx) % k0) != 0, "-DLEFTOVER_LOOP");
+ build_options.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_function)));
+ build_options.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
+ build_options.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_options.add_option("-D" + upper_string(kernel_name.str()));
+
+ if (compile_context.get_ddk_version() >= 30)
+ {
+ build_options.add_option("-fregister-allocation=64");
+ }
+
+ _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options());
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name.str();
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(data_type));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(kernel_width);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(kernel_height);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src->dimension(width_idx));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src->dimension(height_idx));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src->dimension(channel_idx));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(width_idx));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(height_idx));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(channel_idx));
+}
+
+Status ClIndirectConv2dKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *indirect_buffer,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ const DirectConvComputeKernelInfo &desc)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc));
+ return Status{};
+}
+
+void ClIndirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ // Get initial windows
+ Window slice = window.first_slice_window_3D();
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto weights =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const auto biases =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+ const auto indirect_buffer =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_3));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ cl::Image2D weights_cl_image;
+
+ if (_export_to_cl_image)
+ {
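+ // Each pixel of the exported 2D image packs four elements, so the image width is a quarter of
+ // the weights' innermost dimension.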
+ const size_t image_w = weights->info()->dimension(0) / 4;
+ const size_t image_h =
+ weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3);
+ const TensorShape shape2d(image_w, image_h);
+ const size_t image_row_pitch = weights->info()->strides_in_bytes()[1];
+
+ // Export cl_buffer to cl_image
+ weights_cl_image =
+ create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d,
+ weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+ }
+
+ unsigned int idx = 0;
+ add_4d_tensor_nhwc_argument(idx, src);
+ add_4d_tensor_nhwc_argument(idx, indirect_buffer);
+ add_4d_tensor_nhwc_argument(idx, dst);
+ if (_export_to_cl_image)
+ {
+ _kernel.setArg(idx++, weights_cl_image);
+ }
+ add_4d_tensor_nhwc_argument(idx, weights);
+ if (biases != nullptr)
+ {
+ add_1D_tensor_argument(idx, biases, slice);
+ }
+ enqueue(queue, *this, slice, lws_hint());
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClIndirectConv2dKernel.h b/src/gpu/cl/kernels/ClIndirectConv2dKernel.h
new file mode 100644
index 0000000000..04166d417e
--- /dev/null
+++ b/src/gpu/cl/kernels/ClIndirectConv2dKernel.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_INDIRECT_CONV2D_KERNEL_H
+#define ARM_COMPUTE_CL_INDIRECT_CONV2D_KERNEL_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+// Forward declaration
+struct DirectConvComputeKernelInfo;
+
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the indirect convolution kernel. */
+class ClIndirectConv2dKernel : public IClKernel
+{
+public:
+ ClIndirectConv2dKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClIndirectConv2dKernel);
+ /** Set the src, offset, weights, biases and dst tensors info.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src The src tensor info to convolve. 3 lower dimensions represent a single src [IFM, width, height],
+ * while every optional dimension from 4 and above represents a batch of inputs. Data types supported: F16/F32.
+ * @param[in] off The indirect buffer tensor info. Data types supported: S32.
+ * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [IFM, kernel_x, kernel_y, OFM].
+ * Data type supported: Same as @p src.
+ * @param[in] biases Biases tensor info. Biases are 1D tensor with dimension [OFM].
+ * Data type supported: Same as @p src.
+ * @param[out] dst Output tensor info.
+ * The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p src.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info Contains activation information described in @ref ActivationLayerInfo.
+ * @param[in] desc Direct convolution descriptor used to build the NHWC indirect convolution kernel.
+ */
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *off,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ const DirectConvComputeKernelInfo &desc);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClIndirectConv2dKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *off,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ const DirectConvComputeKernelInfo &desc);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+public:
+ bool _export_to_cl_image{false};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_INDIRECT_CONV2D_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp
new file mode 100644
index 0000000000..0bb6b0c083
--- /dev/null
+++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/QuantizationInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info)
+{
+ const bool adj_lhs = matmul_kernel_info.adj_lhs;
+ const bool adj_rhs = matmul_kernel_info.adj_rhs;
+ const int m0 = matmul_kernel_info.m0;
+ const int n0 = matmul_kernel_info.n0;
+ const int k0 = matmul_kernel_info.k0;
+
+ // Validate M0
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0");
+
+ if (adj_lhs)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16),
+ "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
+ }
+
+ // Validate N0
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16),
+ "Only 1,2,3,4,8,16 are supported for N0");
+
+ // Validate K0
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 < 1, "Only positive integers are supported for K0");
+ if (!adj_lhs || adj_rhs)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16),
+ "Only 1,2,3,4,8,16 are supported for K0");
+ }
+
+ return Status{};
+}
+} // namespace
+ClMatMulLowpNativeKernel::ClMatMulLowpNativeKernel()
+{
+ _type = CLKernelType::GEMM;
+}
+Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.activation() != ActivationFunction::IDENTITY &&
+ act_info.activation() != ActivationFunction::RELU &&
+ act_info.activation() != ActivationFunction::LU_BOUNDED_RELU &&
+ act_info.activation() != ActivationFunction::BOUNDED_RELU),
+ "Activation Function specified is unsupported.");
+ const TensorShape expected_output_shape =
+ misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info);
+
+ if (dst->total_size() != 0)
+ {
+ const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(expected_output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
+ }
+
+ if (bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != bias->dimension(0));
+ }
+
+ return Status{};
+}
+void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info);
+ ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst, matmul_kernel_info);
+ ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info));
+
+ // dst tensor auto initialization if not yet initialized
+ auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(
+ lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
+
+ const int m = dst->dimension(1);
+ const int n = dst->dimension(0);
+ const int k = matmul_kernel_info.adj_lhs ? lhs->tensor_shape().y() : lhs->tensor_shape().x();
+ const bool adj_lhs = matmul_kernel_info.adj_lhs;
+
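+ // The requested M0/N0 block sizes are clamped (and, where needed, snapped to a supported
+ // vector size) so that they never exceed the output dimensions.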
+ int m0 = adj_lhs ? adjust_vec_size(matmul_kernel_info.m0, m) : std::min(matmul_kernel_info.m0, m);
+ int n0 = adjust_vec_size(matmul_kernel_info.n0, n);
+
+ // Configure kernel window
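+ // Each work-item computes an N0 x M0 tile of the output; batch dimensions are collapsed onto the third window dimension.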
+ Window win = calculate_max_window(*dst, Steps(n0, m0));
+ win = win.collapse(win, Window::DimZ);
+ IClKernel::configure_internal(win);
+
+ // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+ const unsigned int partial_store_m0 = m % m0;
+ const unsigned int partial_store_n0 = n % n0;
+
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(lhs->data_type()));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(matmul_kernel_info.k0));
+ build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+ build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
+ build_opts.add_option("-DK=" + support::cpp11::to_string(k));
+
+ const UniformQuantizationInfo lqinfo = lhs->quantization_info().uniform();
+ const UniformQuantizationInfo rqinfo = rhs->quantization_info().uniform();
+ const UniformQuantizationInfo dqinfo = dst->quantization_info().uniform();
+
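+ // The effective requantization scale (lhs_scale * rhs_scale / dst_scale) is decomposed into a
+ // fixed-point multiplier and shift so the kernel can rescale the int32 accumulators using
+ // integer-only arithmetic.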
+ float multiplier = lqinfo.scale * rqinfo.scale / dqinfo.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
+ arm_compute::quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+
+ build_opts.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+ build_opts.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift));
+
+ // Note: Offsets are not negated, unlike in the gemmlowp kernels
+ build_opts.add_option("-DLHS_OFFSET=" + support::cpp11::to_string(lqinfo.offset));
+ build_opts.add_option("-DRHS_OFFSET=" + support::cpp11::to_string(rqinfo.offset));
+ build_opts.add_option("-DDST_OFFSET=" + support::cpp11::to_string(dqinfo.offset));
+ build_opts.add_option_if(bias != nullptr, "-DBIAS");
+
+ // The floating-point activation boundaries are quantized before being passed as kernel arguments.
+ // Note: We expect the input and output tensors to always adopt a per-tensor quantization approach.
+ int a_val{};
+ int b_val{};
+ std::tie(b_val, a_val) = get_quantized_activation_min_max(act_info, dst->data_type(), dqinfo);
+
+ build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
+ build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
+ build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
+ build_opts.add_option("-DZERO_POINT=" + support::cpp11::to_string(dqinfo.offset));
+
+ std::string kernel_name("mat_mul_native_quantized");
+ kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt";
+ kernel_name += matmul_kernel_info.adj_rhs ? "_t" : "_nt";
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Set config_id for enabling LWS tuning
+ const size_t number_of_batches = dst->tensor_shape().total_size() / (m * n);
+
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(lhs->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(m);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(n);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(k);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(number_of_batches);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(m0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(n0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(matmul_kernel_info.k0);
+}
+
+void ClMatMulLowpNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const ICLTensor *lhs =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const ICLTensor *rhs =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const ICLTensor *bias =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+ ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+ ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst);
+
+ unsigned int idx = 0;
+ Window window_collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
+
+ add_3d_tensor_nhw_argument(idx, lhs);
+ add_3d_tensor_nhw_argument(idx, rhs);
+ if (bias != nullptr)
+ {
+ add_3d_tensor_nhw_argument(idx, bias);
+ }
+ add_3d_tensor_nhw_argument(idx, dst);
+
+ enqueue(queue, *this, window_collapsed, lws_hint());
+}
+
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
new file mode 100644
index 0000000000..ffdb720855
--- /dev/null
+++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEKERNEL
+#define ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEKERNEL
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+struct MatMulKernelInfo;
+namespace opencl
+{
+namespace kernels
+{
+class ClMatMulLowpNativeKernel : public IClKernel
+{
+public:
+ ClMatMulLowpNativeKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMatMulLowpNativeKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] lhs Input tensor info for the LHS matrix. Data type supported: QASYMM8_SIGNED/QASYMM8.
+ * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
+ * @param[in] rhs Input tensor info for the RHS matrix. Data type supported: same as @p lhs.
+ * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
+ * @param[in] bias Bias tensor info. Can be nullptr. Data type supported: S32.
+ * @param[out] dst Output tensor info. Data type supported: same as @p lhs
+ * @param[in] matmul_kernel_info Attributes for Batch MatMul Kernel
+ * @param[in] act_info (Optional) Class containing information about fused activation function.
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClMatMulLowpNativeKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEKERNEL */
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp
new file mode 100644
index 0000000000..1df0ca0410
--- /dev/null
+++ b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+#include "utils/TypePrinter.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+// Block size dimensions for the MMUL extension
+constexpr int mmul_m0 = 4;
+constexpr int mmul_n0 = 4;
+constexpr int mmul_k0 = 16;
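+// Note: K0 is 16 for the 8-bit data types handled by this kernel; the floating-point MMUL variant uses 4.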
+
+Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info)
+{
+ const bool adj_lhs = matmul_kernel_info.adj_lhs;
+ const int m0 = matmul_kernel_info.m0;
+ const int n0 = matmul_kernel_info.n0;
+ const int k0 = matmul_kernel_info.k0;
+
+ // Validate M0
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0");
+
+ if (adj_lhs)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16),
+ "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
+ }
+
+ // Validate N0
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16),
+ "Only 1,2,3,4,8,16 are supported for N0");
+
+ // Validate K0
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((k0 != 4), "Only 4 is supported for K0");
+
+ // Validate ExportToCLImage
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(matmul_kernel_info.export_rhs_to_cl_image, "Export to CLImage is not supported!");
+
+ return Status{};
+}
+} // namespace
+
+ClMatMulLowpNativeMMULKernel::ClMatMulLowpNativeMMULKernel()
+{
+ _type = CLKernelType::GEMM;
+}
+
+Status ClMatMulLowpNativeMMULKernel::validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()),
+ "The extension cl_arm_matrix_multiply is not supported on the target platform");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info));
+
+ const TensorShape &lhs_shape = lhs->tensor_shape();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_input_shapes(lhs_shape, rhs->tensor_shape(), matmul_kernel_info));
+
+ const size_t lhs_k = matmul_kernel_info.adj_lhs ? lhs_shape.y() : lhs_shape.x();
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR((lhs_k % mmul_k0) != 0, "K dimension must be a multiple of %d", mmul_k0);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.activation() != ActivationFunction::IDENTITY),
+ "Activation Function specified is unsupported.");
+ const TensorShape expected_output_shape =
+ misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info);
+
+ if (dst->total_size() != 0)
+ {
+ const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(expected_output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
+ }
+
+ if (bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != bias->dimension(0));
+ }
+
+ return Status{};
+}
+
+void ClMatMulLowpNativeMMULKernel::configure(const ClCompileContext &compile_context,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+ ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst, matmul_kernel_info, act_info);
+ ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info));
+
+ // dst tensor auto initialization if not yet initialized
+ auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(
+ lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
+
+ ARM_COMPUTE_UNUSED(compile_context, lhs, rhs, bias, matmul_kernel_info, act_info);
+ CLBuildOptions build_opts;
+
+ const int m = dst->dimension(1);
+ const int n = dst->dimension(0);
+ const int k = matmul_kernel_info.adj_lhs ? lhs->tensor_shape().y() : lhs->tensor_shape().x();
+
+ const int m0 = std::min(matmul_kernel_info.m0, m);
+ const int n0 = adjust_vec_size(matmul_kernel_info.n0, n);
+
+ // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks
+ // at the end of a row/column if any. This is to avoid padding.
+ const unsigned int m0_leftover = m % m0;
+ const unsigned int n0_leftover = n % n0;
+
+ // Configure kernel window
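+ // The MMUL variant distributes work differently from the plain native kernel, so the window is
+ // configured by a dedicated helper that takes the MMUL block size into account.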
+ const auto win_config =
+ validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ IClKernel::configure_internal(win_config.second);
+
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(lhs->data_type()));
+ build_opts.add_option("-DM=" + support::cpp11::to_string(m));
+ build_opts.add_option("-DN=" + support::cpp11::to_string(n));
+ build_opts.add_option("-DK=" + support::cpp11::to_string(k));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(matmul_kernel_info.k0));
+ build_opts.add_option("-DM0_LEFTOVER=" + support::cpp11::to_string(m0_leftover));
+ build_opts.add_option("-DN0_LEFTOVER=" + support::cpp11::to_string(n0_leftover));
+ build_opts.add_option("-DMMUL_M0=" + support::cpp11::to_string(mmul_m0));
+ build_opts.add_option("-DMMUL_N0=" + support::cpp11::to_string(mmul_n0));
+ build_opts.add_option("-DMMUL_K0=" + support::cpp11::to_string(mmul_k0));
+ build_opts.add_option_if(bias != nullptr, "-DBIAS");
+
+ const UniformQuantizationInfo lqinfo = lhs->quantization_info().uniform();
+ const UniformQuantizationInfo rqinfo = rhs->quantization_info().uniform();
+ const UniformQuantizationInfo dqinfo = dst->quantization_info().uniform();
+
+ float multiplier = lqinfo.scale * rqinfo.scale / dqinfo.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
+ arm_compute::quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+
+ build_opts.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+ build_opts.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift));
+
+ // Note: Offsets are not negated, unlike in the gemmlowp kernels
+ build_opts.add_option("-DLHS_OFFSET=" + support::cpp11::to_string(lqinfo.offset));
+ build_opts.add_option("-DRHS_OFFSET=" + support::cpp11::to_string(rqinfo.offset));
+ build_opts.add_option("-DDST_OFFSET=" + support::cpp11::to_string(dqinfo.offset));
+
+ std::string kernel_name("mat_mul_native_quantized_mmul");
+ kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt";
+ kernel_name += matmul_kernel_info.adj_rhs ? "_t" : "_nt";
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(lhs->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(m);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(n);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(k);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(m0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(n0);
+}
+
+void ClMatMulLowpNativeMMULKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto *lhs =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto *rhs =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const auto *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(
+ tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present
+ auto *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+ ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst);
+
+ unsigned int idx = 0;
+ add_3d_tensor_nhw_argument(idx, lhs);
+ add_3d_tensor_nhw_argument(idx, rhs);
+
+ if (bias != nullptr)
+ {
+ add_3d_tensor_nhw_argument(idx, bias);
+ }
+ add_3d_tensor_nhw_argument(idx, dst);
+
+ // LWS_x should be a multiple of 16 at the very least. (32, 2) has been chosen to have more work-items on a single core.
+ // The LWS also enforces the order of execution of the work-items, which improves cache utilization.
+ enqueue(queue, *this, window, cl::NDRange(32, 2), false);
+}
+
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h
new file mode 100644
index 0000000000..e84d46d34a
--- /dev/null
+++ b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEMMULKERNEL_H
+#define ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEMMULKERNEL_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+struct MatMulKernelInfo;
+namespace opencl
+{
+namespace kernels
+{
+class ClMatMulLowpNativeMMULKernel : public IClKernel
+{
+public:
+ ClMatMulLowpNativeMMULKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMatMulLowpNativeMMULKernel);
+
+ /** Initialise the kernel's input and output.
+ *
+ * Similar to @ref ClMatMulLowpNativeKernel::configure()
+ *
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClMatMulLowpNativeKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEMMULKERNEL_H
diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
new file mode 100644
index 0000000000..a1fa9fa9ab
--- /dev/null
+++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/CL/CLUtils.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info)
+{
+ const bool adj_lhs = matmul_kernel_info.adj_lhs;
+ const bool adj_rhs = matmul_kernel_info.adj_rhs;
+ const int m0 = matmul_kernel_info.m0;
+ const int n0 = matmul_kernel_info.n0;
+ const int k0 = matmul_kernel_info.k0;
+
+ // Validate M0
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0");
+
+ if (adj_lhs)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16),
+ "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
+ }
+
+ // Validate N0
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16),
+ "Only 1,2,3,4,8,16 are supported for N0");
+
+ // Validate K0
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 < 1, "Only positive integers are supported for K0");
+ if (!adj_lhs || adj_rhs)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16),
+ "Only 1,2,3,4,8,16 are supported for K0");
+ }
+
+ return Status{};
+}
+
+Status validate_export_to_cl_image(const ITensorInfo *rhs, const MatMulKernelInfo &matmul_kernel_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(matmul_kernel_info.export_rhs_to_cl_image && rhs->lock_paddings());
+ if (matmul_kernel_info.export_rhs_to_cl_image)
+ {
+ if (matmul_kernel_info.adj_rhs)
+ {
+ const int k0 = matmul_kernel_info.k0;
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 != 4 && k0 != 8 && k0 != 16,
+ "K0 can only be: 4, 8, and 16 for Rhs transposed");
+ }
+ else
+ {
+ const int n0 = matmul_kernel_info.n0;
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 != 4 && n0 != 8 && n0 != 16,
+ "N0 can only be: 4, 8, and 16 for Rhs non-transposed");
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(rhs),
+ "Export to CLImage is not supported for this device/configuration");
+ }
+
+ return Status{};
+}
+} // namespace
+ClMatMulNativeKernel::ClMatMulNativeKernel()
+{
+ _type = CLKernelType::GEMM;
+}
+
+Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_UNUSED(act_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_export_to_cl_image(rhs, matmul_kernel_info));
+
+ const TensorShape expected_output_shape =
+ misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info);
+
+ if (dst->total_size() != 0)
+ {
+ const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
+ }
+
+ if (bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bias, lhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((bias->num_dimensions() > 1), "Multi dimensional bias is unsupported.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0],
+ "First dimension of bias and output tensors must match.");
+ }
+
+ return Status{};
+}
+void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info);
+ ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst, matmul_kernel_info);
+ ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info));
+
+ // dst tensor auto initialization if not yet initialized
+ auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(
+ lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
+
+ const int m = dst->dimension(1);
+ const int n = dst->dimension(0);
+ const int k = matmul_kernel_info.adj_lhs ? lhs->tensor_shape().y() : lhs->tensor_shape().x();
+ const bool adj_lhs = matmul_kernel_info.adj_lhs;
+
+ int m0 = adj_lhs ? adjust_vec_size(matmul_kernel_info.m0, m) : std::min(matmul_kernel_info.m0, m);
+ int n0 = adjust_vec_size(matmul_kernel_info.n0, n);
+
+ _export_rhs_to_cl_image = matmul_kernel_info.export_rhs_to_cl_image && !rhs->lock_paddings();
+
+ // Configure kernel window
+ Window win = calculate_max_window(*dst, Steps(n0, m0));
+ win = win.collapse(win, Window::DimZ);
+ IClKernel::configure_internal(win);
+
+ // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+ const unsigned int partial_store_m0 = m % m0;
+ const unsigned int partial_store_n0 = n % n0;
+
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(lhs->data_type()));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(matmul_kernel_info.k0));
+ build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+ build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
+ build_opts.add_option("-DK=" + support::cpp11::to_string(k));
+ build_opts.add_option_if(bias != nullptr, "-DBIAS");
+ build_opts.add_option_if_else(_export_rhs_to_cl_image, "-DRHS_TENSOR_TYPE=IMAGE", "-DRHS_TENSOR_TYPE=BUFFER");
+
+ // Define values for activation function
+ build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(act_info.a())));
+ build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(act_info.b())));
+ build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
+
+ std::string kernel_name("mat_mul_native");
+ kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt";
+ kernel_name += matmul_kernel_info.adj_rhs ? "_t" : "_nt";
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
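+ // Reading the RHS through a cl_image requires its row pitch to satisfy the device's image pitch
+ // alignment, so extra padding may be added to the RHS tensor info.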
+ if (_export_rhs_to_cl_image)
+ {
+ gemm::update_padding_for_cl_image(rhs);
+ }
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(lhs->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(m);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(n);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(k);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_export_rhs_to_cl_image);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(m0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(n0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(matmul_kernel_info.k0);
+}
+
+void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const ICLTensor *lhs =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const ICLTensor *rhs =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(
+ tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present
+ ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+ ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst);
+
+ unsigned int idx = 0;
+ Window window_collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
+
+ add_3d_tensor_nhw_argument(idx, lhs);
+
+ cl::Image2D rhs_cl_image;
+ if (_export_rhs_to_cl_image)
+ {
+ const size_t image_w = rhs->info()->dimension(0) / 4;
+ const size_t image_h = rhs->info()->tensor_shape().total_size() / rhs->info()->dimension(0);
+ const TensorShape shape2d(image_w, image_h);
+ const size_t image_row_pitch = rhs->info()->strides_in_bytes()[1];
+
+ // Export cl_buffer to cl_image
+ rhs_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), rhs->cl_buffer(), shape2d,
+ rhs->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+ _kernel.setArg(idx++, rhs_cl_image);
+ }
+
+ add_3d_tensor_nhw_argument(idx, rhs);
+ if (bias != nullptr)
+ {
+ add_3d_tensor_nhw_argument(idx, bias);
+ }
+ add_3d_tensor_nhw_argument(idx, dst);
+
+ enqueue(queue, *this, window_collapsed, lws_hint());
+}
+
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.h b/src/gpu/cl/kernels/ClMatMulNativeKernel.h
new file mode 100644
index 0000000000..2cb150bc8f
--- /dev/null
+++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLMATMULNATIVEKERNEL
+#define ACL_SRC_GPU_CL_KERNELS_CLMATMULNATIVEKERNEL
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+class ClMatMulNativeKernel : public IClKernel
+{
+public:
+ ClMatMulNativeKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMatMulNativeKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] lhs Input tensor info for the LHS matrix. Data type supported: F32/F16.
+ * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
+ * @param[in] rhs Input tensor info for the RHS matrix. Data type supported: same as @p lhs.
+ * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
+ * @param[in] bias Bias tensor info for bias matrix. Can be nullptr. Data type supported: same as @p lhs.
+ * @param[out] dst Output tensor info. Data type supported: same as @p lhs
+ * @param[in] matmul_kernel_info Attributes for Batch MatMul Kernel
+ * @param[in] act_info (Optional) Specifies the activation function to use after the matrix multiplication. Default is the identity function.
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClMatMulNativeKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ bool _export_rhs_to_cl_image{false};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ACL_SRC_GPU_CL_KERNELS_CLMATMULNATIVEKERNEL */
diff --git a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp
new file mode 100644
index 0000000000..76bf846e74
--- /dev/null
+++ b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp
@@ -0,0 +1,235 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+// Block size dimensions for the MMUL extension
+constexpr int mmul_m0 = 4;
+constexpr int mmul_n0 = 4;
+constexpr int mmul_k0 = 4;
+
+Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info)
+{
+ const bool adj_lhs = matmul_kernel_info.adj_lhs;
+ const int m0 = matmul_kernel_info.m0;
+ const int n0 = matmul_kernel_info.n0;
+ const int k0 = matmul_kernel_info.k0;
+
+ // Validate M0
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0");
+
+ if (adj_lhs)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16),
+ "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
+ }
+
+ // Validate N0
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16),
+ "Only 1,2,3,4,8,16 are supported for N0");
+
+ // Validate K0
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((k0 != 1), "Only 1 is supported for K0");
+
+ return Status{};
+}
+} // namespace
+ClMatMulNativeMMULKernel::ClMatMulNativeMMULKernel()
+{
+ _type = CLKernelType::GEMM;
+}
+
+Status ClMatMulNativeMMULKernel::validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()),
+ "The extension cl_arm_matrix_multiply is not supported on the target platform");
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info));
+
+ const TensorShape &lhs_shape = lhs->tensor_shape();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_input_shapes(lhs_shape, rhs->tensor_shape(), matmul_kernel_info));
+
+ const size_t lhs_k = matmul_kernel_info.adj_lhs ? lhs_shape.y() : lhs_shape.x();
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR((lhs_k % mmul_k0) != 0, "K dimension must be a multiple of %d", mmul_k0);
+
+ const TensorShape expected_output_shape =
+ misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info);
+
+ if (dst->total_size() != 0)
+ {
+ const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
+ }
+
+ if (bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((bias->num_dimensions() > 1), "Multi dimensional bias is unsupported.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0],
+ "First dimension of bias and output tensors must match.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, bias);
+ }
+
+ return Status{};
+}
+void ClMatMulNativeMMULKernel::configure(const ClCompileContext &compile_context,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+ ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst, matmul_kernel_info);
+ ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info));
+
+ // dst tensor auto initialization if not yet initialized
+ auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(
+ lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
+
+ const int m = dst->dimension(1);
+ const int n = dst->dimension(0);
+ const int k = matmul_kernel_info.adj_lhs ? lhs->tensor_shape().y() : lhs->tensor_shape().x();
+
+ _m = m;
+ _n = n;
+ _k = k;
+
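+    // Clamp the requested M0 and N0 to the output dimensions so the block sizes never exceed the matrix size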
+ const int m0 = std::min(matmul_kernel_info.m0, m);
+ const int n0 = adjust_vec_size(matmul_kernel_info.n0, n);
+
+ // Configure kernel window
+ const auto win_config =
+ validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ IClKernel::configure_internal(win_config.second);
+
+    // Calculate the partial (store-side rather than load-side) M0 and N0 for any partial blocks at the end of a row/column. This avoids the need for padding.
+ const unsigned int m0_leftover = m % m0;
+ const unsigned int n0_leftover = n % n0;
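+    // e.g. with m = 10 and m0 = 4, m0_leftover = 2: the last block along M stores only 2 rows rather than padding to 4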
+
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(lhs->data_type()));
+ build_opts.add_option_if(lhs->data_type() == DataType::F16, "-DHALF_PRECISION");
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+ build_opts.add_option("-DM0_LEFTOVER=" + support::cpp11::to_string(m0_leftover));
+ build_opts.add_option("-DN0_LEFTOVER=" + support::cpp11::to_string(n0_leftover));
+ build_opts.add_option("-DMMUL_M0=" + support::cpp11::to_string(mmul_m0));
+ build_opts.add_option("-DMMUL_N0=" + support::cpp11::to_string(mmul_n0));
+ build_opts.add_option("-DMMUL_K0=" + support::cpp11::to_string(mmul_k0));
+ build_opts.add_option_if(bias != nullptr, "-DBIAS");
+
+ std::string kernel_name("mat_mul_native_mmul");
+ kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt";
+ kernel_name += matmul_kernel_info.adj_rhs ? "_t" : "_nt";
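+    // e.g. a non-transposed lhs with a transposed rhs selects the "mat_mul_native_mmul_nt_t" kernel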
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(lhs->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(k);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(m0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(n0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(matmul_kernel_info.k0);
+}
+
+void ClMatMulNativeMMULKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const ICLTensor *lhs =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const ICLTensor *rhs =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(
+ tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present
+ ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+ ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst);
+ unsigned int idx = 0;
+
+ add_3d_tensor_nhw_argument(idx, lhs);
+ add_3d_tensor_nhw_argument(idx, rhs);
+ if (bias != nullptr)
+ {
+ add_3d_tensor_nhw_argument(idx, bias);
+ }
+ add_3d_tensor_nhw_argument(idx, dst);
+
+    // Pass m, n and k at runtime as signed ints so that any subtraction they appear in as an operand still produces a signed result.
+ _kernel.setArg<cl_int>(idx++, _m);
+ _kernel.setArg<cl_int>(idx++, _n);
+ _kernel.setArg<cl_int>(idx++, _k);
+
+    // LWS_x should be at least a multiple of 16. (32, 2) has been chosen to have more work-items on a single core.
+    // The LWS also enforces the order of execution of the work-items, which improves cache utilization.
+ enqueue(queue, *this, window, cl::NDRange(32, 2), false);
+}
+
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h
new file mode 100644
index 0000000000..1aeb896325
--- /dev/null
+++ b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLMATMULNATIVEMMULKERNEL
+#define ACL_SRC_GPU_CL_KERNELS_CLMATMULNATIVEMMULKERNEL
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+struct MatMulKernelInfo;
+namespace opencl
+{
+namespace kernels
+{
+class ClMatMulNativeMMULKernel : public IClKernel
+{
+public:
+ ClMatMulNativeMMULKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMatMulNativeMMULKernel);
+ /** Initialize the kernel's input and output.
+ *
+ * This kernel performs matrix multiplication of lhs and rhs:
+ *
+ * dst = matmul(lhs, rhs)
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |lhs |rhs |dst |
+ * |:--------------|:--------------|:--------------|
+ * |F32 |F32 |F32 |
+ * |F16 |F16 |F16 |
+ *
+ * Shape definitions:
+ * Dim0, Dim1, Dim2...
+ * lhs: [ K, M, Batch dims...]
+ * rhs: [ N, K, Batch dims...]
+ * dst: [ N, M, Batch dims...]
+ *
+ * Valid shape configurations:
+ * - K must be a multiple of 4 (MMUL_K0).
+ * - No broadcasting in batch dimensions. I.e. batch dims must be the same across lhs, rhs and dst
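+ *
+ *  For example, lhs: [8, 5, 2] and rhs: [3, 8, 2] produce dst: [3, 5, 2], where K = 8 is a multiple of MMUL_K0 = 4.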
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] lhs Input tensor info for the LHS matrix.
+ * @param[in] rhs Input tensor info for the RHS matrix.
+ * @param[in] bias Bias tensor info. Can be nullptr. Data type supported: Same as @p lhs.
+ * @param[out] dst Output tensor info.
+ * @param[in] matmul_info Attributes for Batch MatMul Kernel
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClMatMulNativeMMULKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ int _m{1};
+ int _n{1};
+ int _k{1};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ACL_SRC_GPU_CL_KERNELS_CLMATMULNATIVEMMULKERNEL */
diff --git a/src/gpu/cl/kernels/ClMulKernel.cpp b/src/gpu/cl/kernels/ClMulKernel.cpp
new file mode 100644
index 0000000000..3b59c2a7fc
--- /dev/null
+++ b/src/gpu/cl/kernels/ClMulKernel.cpp
@@ -0,0 +1,484 @@
+/*
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClMulKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_UNUSED(overflow_policy);
+ ARM_COMPUTE_UNUSED(rounding_policy);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
+ DataType::F16, DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
+ DataType::F16, DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
+ ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type()));
+
+ // Check whether it is in_place calculation
+ const bool in_place = (src1 == dst) || (src2 == dst);
+ const bool src1_in_place = in_place && (src1 == dst);
+
+ const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+
+ // Validate in case of configured dst
+ if (dst->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
+ DataType::F16, DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::U8 &&
+ (src1->data_type() != DataType::U8 || src2->data_type() != DataType::U8),
+ "Dst can only be U8 if both src are U8");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ dst->data_type() == DataType::QASYMM8 &&
+ (src1->data_type() != DataType::QASYMM8 || src2->data_type() != DataType::QASYMM8),
+ "Dst can only be QASYMM8 if both src are QASYMM8");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ dst->data_type() == DataType::QASYMM8_SIGNED &&
+ (src1->data_type() != DataType::QASYMM8_SIGNED || src2->data_type() != DataType::QASYMM8_SIGNED),
+ "Dst can only be QASYMM8_SIGNED if both src are QASYMM8_SIGNED");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ dst->data_type() == DataType::QSYMM16 &&
+ (src1->data_type() != DataType::QSYMM16 || src2->data_type() != DataType::QSYMM16),
+ "Dst can only be QSYMM16 if both src are QSYMM16");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((src1->data_type() == DataType::S32 || src2->data_type() == DataType::S32) &&
+ (dst->data_type() != DataType::S32),
+ "Dst must be S32 if source tensors are S32");
+ if (in_place)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ detail::have_different_dimensions(out_shape,
+ src1_in_place ? src1->tensor_shape() : src2->tensor_shape(), 0),
+ "Wrong shape for dst, cannot do in_place calculation");
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
+ "Wrong shape for dst");
+ }
+ }
+
+ return Status{};
+}
+} // namespace
+
+ClMulKernel::ClMulKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClMulKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info));
+
+ auto padding_info = get_padding_info({src1, src2, dst});
+
+ const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
+ auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape));
+
+ int scale_int = -1;
+ // Extract sign, exponent and mantissa
+ int exponent = 0;
+ float normalized_mantissa = std::frexp(scale, &exponent);
+    // Use int scaling if the factor is equal to 1/2^n for 0 <= n <= 15
+    // frexp returns 0.5 as the mantissa, which means the exponent will be in the range -14 <= e <= 1
+    // (e = 1 - n, so it is non-positive for any n >= 1)
+ if ((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
+ {
+        // Store the positive exponent n. We know that we compute 1/2^n
+        // Subtract 1 to compensate for frexp using a mantissa of 0.5 rather than 1.0
+ scale_int = std::abs(exponent - 1);
+ }
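+    // e.g. scale = 1/8 = 2^-3: frexp returns a mantissa of 0.5 with exponent -2, so scale_int = |-2 - 1| = 3
+    // and, for non-quantized inputs, the "_int" kernel variant is selected below instead of the "_float" one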
+
+ std::string acc_type;
+ // Check if it has float src and dst
+ if (is_data_type_float(src1->data_type()) || is_data_type_float(src2->data_type()))
+ {
+ scale_int = -1;
+ acc_type = (src1->data_type() == DataType::F32 || src2->data_type() == DataType::F32) ? "float" : "half";
+ }
+ else
+ {
+ if (src1->element_size() == 4 || src2->element_size() == 4)
+ {
+            // Use 64-bit accumulator for 32-bit input
+ acc_type = "long";
+ }
+ else if (src1->element_size() == 2 || src2->element_size() == 2)
+ {
+ // Use 32-bit accumulator for 16-bit input
+ acc_type = "int";
+ }
+ else
+ {
+ // Use 16-bit accumulator for 8-bit input
+ acc_type = "ushort";
+ }
+ }
+
+ const bool is_quantized = is_data_type_quantized(src1->data_type());
+ const unsigned int vec_size = adjust_vec_size(16 / dst->element_size(), dst->dimension(0));
+ const unsigned int vec_size_leftover = dst->dimension(0) % vec_size;
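+    // e.g. for an F32 output of width 18: vec_size = 16 / 4 = 4 elements per work-item, with a leftover of 18 % 4 = 2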
+
+ // Set kernel build options
+ std::string kernel_name = "pixelwise_mul";
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(src1->data_type()));
+ build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(src2->data_type()));
+ build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type()));
+ build_opts.add_option("-DVEC_SIZE_IN1=" + ((dst->dimension(0) != 1 && src1->dimension(0) == 1)
+ ? "1"
+ : support::cpp11::to_string(vec_size)));
+ build_opts.add_option("-DVEC_SIZE_IN2=" + ((dst->dimension(0) != 1 && src2->dimension(0) == 1)
+ ? "1"
+ : support::cpp11::to_string(vec_size)));
+ build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(vec_size));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
+ if (is_quantized && (dst->data_type() != DataType::S32))
+ {
+ const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform();
+ const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
+
+ build_opts.add_option_if(is_data_type_quantized_asymmetric(src1->data_type()),
+ "-DOFFSET_IN1=" + support::cpp11::to_string(iq1_info.offset));
+ build_opts.add_option_if(is_data_type_quantized_asymmetric(src2->data_type()),
+ "-DOFFSET_IN2=" + support::cpp11::to_string(iq2_info.offset));
+ build_opts.add_option_if(is_data_type_quantized_asymmetric(dst->data_type()),
+ "-DOFFSET_OUT=" + support::cpp11::to_string(oq_info.offset));
+ build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale));
+ build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale));
+ build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
+ kernel_name += "_quantized";
+ }
+ else
+ {
+ kernel_name += (scale_int >= 0) ? "_int" : "_float";
+ build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(dst->data_type()),
+ "-DWRAP", "-DSATURATE");
+ build_opts.add_option_if_else(rounding_policy == RoundingPolicy::TO_ZERO, "-DROUND=_rtz", "-DROUND=_rte");
+ build_opts.add_option("-DACC_DATA_TYPE=" + acc_type);
+ if (act_info.enabled())
+ {
+ build_opts.add_option("-DACTIVATION_TYPE=" +
+ lower_string(string_from_activation_func(act_info.activation())));
+ build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
+ build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
+ }
+ }
+
+ // Check whether it is in_place calculation
+ const bool in_place = (src1 == dst) || (src2 == dst);
+ const bool src1_in_place = in_place && (src1 == dst);
+ build_opts.add_option_if(in_place, "-DIN_PLACE");
+ build_opts.add_option_if(src1_in_place, "-DSRC1_IN_PLACE");
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Set scale argument
+ unsigned int idx = (in_place ? 2 : 3) * num_arguments_per_3D_tensor(); // Skip the src and dst parameters
+
+ if (scale_int >= 0 && !is_quantized)
+ {
+ _kernel.setArg(idx++, scale_int);
+ }
+ else
+ {
+ _kernel.setArg(idx++, scale);
+ }
+
+ Window win = calculate_max_window(*dst, Steps(vec_size));
+ ICLKernel::configure_internal(win);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(dst->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src1->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src1->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src1->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src2->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src2->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src2->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(2));
+}
+
+Status ClMulKernel::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info));
+
+ return Status{};
+}
+
+void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src_0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto src_1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst);
+
+ const TensorShape &in_shape1 = src_0->info()->tensor_shape();
+ const TensorShape &in_shape2 = src_1->info()->tensor_shape();
+ const TensorShape &out_shape = dst->info()->tensor_shape();
+
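+    // Dimensions above Z can only be collapsed when neither input broadcasts there,
+    // i.e. both inputs have identical sizes in every dimension from Z upwards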
+ bool can_collapse = true;
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+ {
+ can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
+ {
+ can_collapse = (in_shape1[d] == in_shape2[d]);
+ }
+ }
+
+ bool has_collapsed = false;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+
+ const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+ const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+ Window slice = collapsed.first_slice_window_3D();
+ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+ // Check whether it is in_place calculation
+ const bool in_place = (src_0 == dst) || (src_1 == dst);
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src_0, slice_input1);
+ add_3D_tensor_argument(idx, src_1, slice_input2);
+ if (!in_place)
+ {
+ add_3D_tensor_argument(idx, dst, slice);
+ }
+ enqueue(queue, *this, slice, lws_hint());
+
+ ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
+ ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+
+namespace
+{
+constexpr unsigned int vec_size_complex = 1;
+
+Status validate_arguments_complex(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
+
+ const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+ ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type()));
+
+ // Validate in case of configured dst
+ if (dst->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
+ "Wrong shape for dst");
+ }
+
+ return Status{};
+}
+} // namespace
+
+ClComplexMulKernel::ClComplexMulKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClComplexMulKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst, act_info));
+
+ auto padding_info = get_padding_info({src1, src2, dst});
+
+ const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
+ auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape));
+
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
+ if (act_info.enabled())
+ {
+ build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
+ build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
+ build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
+ }
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, "pixelwise_mul_complex", build_opts.options());
+
+ Window win = calculate_max_window(*dst, Steps(vec_size_complex));
+ ICLKernel::configure_internal(win);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClComplexMulKernel::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst, act_info));
+
+ return Status{};
+}
+
+void ClComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src_0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto src_1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ const TensorShape &in_shape1 = src_0->info()->tensor_shape();
+ const TensorShape &in_shape2 = src_1->info()->tensor_shape();
+ const TensorShape &out_shape = dst->info()->tensor_shape();
+
+ bool can_collapse = true;
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+ {
+ can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
+ {
+ can_collapse = (in_shape1[d] == in_shape2[d]);
+ }
+ }
+
+ bool has_collapsed = false;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+
+ const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+ const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+ Window slice = collapsed.first_slice_window_3D();
+ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src_0, slice_input1);
+ add_3D_tensor_argument(idx, src_1, slice_input2);
+ add_3D_tensor_argument(idx, dst, slice);
+ enqueue(queue, *this, slice, lws_hint());
+
+ ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
+ ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClMulKernel.h b/src/gpu/cl/kernels/ClMulKernel.h
new file mode 100644
index 0000000000..76a3ce02c1
--- /dev/null
+++ b/src/gpu/cl/kernels/ClMulKernel.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_MUL_KERNEL_H
+#define ARM_COMPUTE_CL_MUL_KERNEL_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the pixelwise multiplication kernel.
+ *
+ * For binary elementwise ops, in-place computation cannot be enabled by passing nullptr as dst; it can only be enabled by passing either src1 or src2 as dst.
+ *
+ */
+class ClMulKernel : public IClKernel
+{
+public:
+ ClMulKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMulKernel);
+ /** Initialise the kernel's src and dst.
+ *
+ * Valid configurations (Input1,Input2) -> Output :
+ *
+ * - (U8,U8) -> U8
+ * - (U8,U8) -> S16
+ * - (U8,S16) -> S16
+ * - (S16,U8) -> S16
+ * - (S16,S16) -> S16
+ * - (S32,S32) -> S32
+ * - (F16,F16) -> F16
+ * - (F32,F32) -> F32
+ * - (QASYMM8,QASYMM8) -> QASYMM8
+ * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+ * - (QSYMM16,QSYMM16) -> QSYMM16
+ * - (QSYMM16,QSYMM16) -> S32
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src1 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
+ * @param[in] src2 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
+ * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
+ * @param[in] scale Scale to apply after multiplication.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClMulKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+
+/** Interface for the complex pixelwise multiplication kernel. */
+class ClComplexMulKernel : public ICLKernel
+{
+public:
+ ClComplexMulKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClComplexMulKernel);
+ /** Initialise the kernel's src and dst.
+ *
+ * @param[in] compile_context The compile context to be used.
+     * @param[in]  src1            An src tensor info. Data types supported: F16/F32. Number of channels supported: 2.
+ * @param[in] src2 An src tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
+ * @param[out] dst The dst tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClComplexMulKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_MUL_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClPermuteKernel.cpp b/src/gpu/cl/kernels/ClPermuteKernel.cpp
new file mode 100644
index 0000000000..a4755782ed
--- /dev/null
+++ b/src/gpu/cl/kernels/ClPermuteKernel.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClPermuteKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+TensorShape get_dst_shape(const ITensorInfo *src, const PermutationVector &perm)
+{
+ TensorShape dst_shape = src->tensor_shape();
+ permute(dst_shape, perm);
+ return dst_shape;
+}
+
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() < 1 || src->num_dimensions() > 4,
+ "Permutation up to 4-D src tensor is supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(perm.num_dimensions() < 1 || perm.num_dimensions() > 4,
+ "Permutation vector size should be less than or equal to 4");
+ for (const auto &p : perm)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(p >= perm.num_dimensions(), "Permutation vector has invalid values");
+ }
+
+ // Validate configured dst
+ if (dst->total_size() != 0)
+ {
+ const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ }
+ return Status{};
+}
+} // namespace
+
+ClPermuteKernel::ClPermuteKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClPermuteKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const PermutationVector &perm)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ auto padding_info = get_padding_info({src, dst});
+ const TensorShape dst_shape = get_dst_shape(src, perm);
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, perm));
+
+ _perm = perm;
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(src->data_type())));
+ build_opts.add_option("-DDEPTH_IN=" + support::cpp11::to_string(src->dimension(2)));
+ // New positions of width(W), height(H), channel(C) and batch(D) based on permutation vector
+ build_opts.add_option("-DP1=" + support::cpp11::to_string((_perm.num_dimensions() >= 1) ? perm[0] : 0));
+ build_opts.add_option("-DP2=" + support::cpp11::to_string((_perm.num_dimensions() >= 2) ? perm[1] : 1));
+ build_opts.add_option("-DP3=" + support::cpp11::to_string((_perm.num_dimensions() >= 3) ? perm[2] : 2));
+ build_opts.add_option("-DP4=" + support::cpp11::to_string((_perm.num_dimensions() >= 4) ? perm[3] : 3));
+
+ _kernel = create_kernel(compile_context, "permute", build_opts.options());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*src, Steps());
+
+ ICLKernel::configure_internal(win);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClPermuteKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, perm));
+
+ return Status{};
+}
+
+void ClPermuteKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ // Setup dst slice
+ Window slice_out(slice_in);
+ slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_out.set(3, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, src, slice_in);
+ add_4D_tensor_argument(idx, dst, slice_out);
+ enqueue(queue, *this, slice_in, lws_hint());
+ } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClPermuteKernel.h b/src/gpu/cl/kernels/ClPermuteKernel.h
new file mode 100644
index 0000000000..2413b10284
--- /dev/null
+++ b/src/gpu/cl/kernels/ClPermuteKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_PERMUTE_KERNEL_H
+#define ARM_COMPUTE_CL_PERMUTE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to perform tensor permutation.
+ *
+ * Permutes the input tensor according to a given permutation vector
+ */
+class ClPermuteKernel : public IClKernel
+{
+public:
+ ClPermuteKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPermuteKernel);
+ /** Set the src and dst of the kernel.
+ *
+ * @note Arbitrary permutation vectors are supported with rank not greater than 4
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src The src tensor info. Data types supported: All.
+ * @param[in] dst The dst tensor info. Data types supported: Same as @p src
+ * @param[in] perm Permutation vector
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const PermutationVector &perm);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClPermuteKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ PermutationVector _perm{};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_PERMUTE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClPool2dKernel.cpp b/src/gpu/cl/kernels/ClPool2dKernel.cpp
new file mode 100644
index 0000000000..41ab4d6922
--- /dev/null
+++ b/src/gpu/cl/kernels/ClPool2dKernel.cpp
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClPool2dKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (is_data_type_quantized_asymmetric(src->data_type()) && pool_info.pool_type == PoolingType::L2),
+ "Unsupported combination of parameters!");
+
+ const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const bool is_global_pooling = pool_info.is_global_pooling;
+ unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
+ unsigned int pool_size_y = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
+ int output_width = 0;
+ int output_height = 0;
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(pool_info),
+ "Pooling region that is entirely outside input tensor is unsupported");
+
+ std::tie(output_width, output_height) =
+ scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], pool_size_x,
+ pool_size_y, pool_info.pad_stride_info);
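+    // e.g. a 7x7 input pooled with a 3x3 window and stride 2 (no padding) yields a 3x3 output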
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1),
+ "Calculated output dimension size is invalid");
+
+ // Check indices
+ if (indices)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX,
+ "Pooling indices only supported for MAX pooling method");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)),
+ "Pooling indices only supported for pool size 2x2");
+
+ if (indices->total_size() != 0)
+ {
+ TensorInfo idx_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, DataType::U32));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &idx_info);
+ }
+ }
+
+ // Checks performed when dst is configured
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+ TensorInfo out_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, dst->data_type()));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info);
+ }
+
+ return Status{};
+}
+} // namespace
+
+ClPool2dKernel::ClPool2dKernel()
+{
+ _type = CLKernelType::POOL;
+}
+
+void ClPool2dKernel::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ ITensorInfo *indices)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices));
+
+ auto padding_info = get_padding_info({src, dst, indices});
+
+ // Auto init if empty
+ TensorShape out_shape = compute_pool_shape(*src, pool_info);
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(out_shape));
+ if (indices)
+ {
+ auto_init_if_empty(*indices, src->clone()->set_tensor_shape(out_shape).set_data_type(DataType::U32));
+ }
+
+ // Set instance variables
+ _pool_info = pool_info;
+ _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
+ _num_elems_processed_per_iteration =
+ (_data_layout == DataLayout::NCHW) ? 1 : ((dst->data_type() == DataType::F32) ? 2 : 4);
+ _num_elems_processed_per_iteration = adjust_vec_size(_num_elems_processed_per_iteration, dst->dimension(0));
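+    // NCHW processes one element per work-item; NHWC vectorizes along the channel dimension
+    // (2 elements for F32, 4 otherwise), reduced if the output's first dimension is smaller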
+
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ const PoolingType pool_type = pool_info.pool_type;
+ const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+ const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES);
+ const int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
+ const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
+ const bool exclude_padding = pool_info.exclude_padding;
+ std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
+ const int pool_pad_top = pad_stride_info.pad_top();
+ const int pool_pad_left = pad_stride_info.pad_left();
+ const DataType data_type = src->data_type();
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type));
+ build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x));
+ build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y));
+ build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_left));
+ build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_top));
+ build_opts.add_option("-DPOOL_SIZE_X=" + support::cpp11::to_string(pool_size_x));
+ build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y));
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width)));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height)));
+ build_opts.add_option("-DMAX_WIDTH=" +
+ support::cpp11::to_string(src->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left)));
+ build_opts.add_option("-DMAX_HEIGHT=" +
+ support::cpp11::to_string(src->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top)));
+
+    // Tensor paddings are used to calculate the indices for MAX pooling
+ if (pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices &&
+ is_data_type_float(data_type))
+ {
+ build_opts.add_option("-DSRC_BATCH=" + support::cpp11::to_string(src->tensor_shape().total_size_lower(3)));
+ }
+
+ if (is_data_type_quantized_asymmetric(data_type))
+ {
+ build_opts.add_option("-DQUANTIZED");
+
+ if (src->quantization_info() != dst->quantization_info())
+ {
+ const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
+
+ build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
+ build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
+ build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale));
+ build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
+ }
+ }
+
+ // Set the initial value for the pooling operation accordingly with the data type
+ if (pool_type == PoolingType::MAX)
+ {
+ if (is_data_type_quantized(data_type))
+ {
+ PixelValue type_min{};
+ std::tie(type_min, std::ignore) = get_min_max(data_type);
+ build_opts.add_option("-DINITIAL_VALUE=" + support::cpp11::to_string(type_min.get<int32_t>()));
+ }
+ else
+ {
+ std::string initial_value = pool_info.use_inf_as_limit
+ ? "(-INFINITY)"
+ : float_to_string_with_full_precision(std::numeric_limits<float>::lowest());
+ build_opts.add_option("-DINITIAL_VALUE=" + initial_value);
+ }
+ }
+ else
+ {
+ // Pool AVG and Pool L2 initial value
+ build_opts.add_option("-DINITIAL_VALUE=0");
+ }
+
+ // Create kernel
+ switch (_data_layout)
+ {
+ case DataLayout::NCHW:
+ {
+ const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision;
+ const auto use_wider_accumulator = use_fp_mixed_precision && (pool_type != PoolingType::MAX);
+ const auto acc_data_type = get_cl_type_from_data_type(
+ use_wider_accumulator ? DataType::F32
+ : (is_data_type_quantized(data_type) ? DataType::S32 : data_type));
+ build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type);
+ build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION");
+
+ if (pool_type != PoolingType::MAX)
+ {
+ build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
+ }
+
+ if (pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices &&
+ is_data_type_float(data_type))
+ {
+                // For max pooling with pool2x2, store indices which will be used in max unpooling
+ std::string kernel_name = "pooling_layer_2_nchw_indices";
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+ }
+ else // Run general case
+ {
+ std::string kernel_name = "pooling_layer_MxN_nchw";
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+ }
+ break;
+ }
+ case DataLayout::NHWC:
+ {
+            // Floating point mixed precision is supported on F16 only
+ const auto use_fp_mixed_precision =
+ (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX;
+
+ // Wider accumulation is required to avoid accuracy loss
+ // Case 1: Floating point mixed precision (fp16 src data and fp32 accumulation)
+            // Case 2: Quantized (int8/uint8 src data and int32 accumulation)
+ DataType acc_data_type = data_type;
+
+ if (use_fp_mixed_precision)
+ {
+ acc_data_type = DataType::F32;
+ }
+ else if (is_data_type_quantized(data_type) && pool_type != PoolingType::MAX)
+ {
+ acc_data_type = DataType::S32;
+ }
+
+ build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(acc_data_type));
+ build_opts.add_option_if(use_fp_mixed_precision, "-DFP_MIXED_PRECISION");
+ build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width)));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height)));
+ build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height)));
+ build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel)));
+ build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size)));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration));
+ if (pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type))
+ {
+ build_opts.add_option_if(indices != nullptr && pool_type == PoolingType::MAX, "-DEXTRACT_MAX_INDEX");
+
+ std::string kernel_name = "pooling_layer_2x2_nhwc";
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+ }
+ else
+ {
+ std::string kernel_name = is_data_type_quantized_asymmetric(data_type)
+ ? "pooling_layer_MxN_quantized_nhwc"
+ : "pooling_layer_MxN_nhwc";
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*dst, Steps(_num_elems_processed_per_iteration));
+ ICLKernel::configure_internal(win);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "pooling_layer_";
+ _config_id += lower_string(string_from_data_type(data_type));
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_layout(_data_layout));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(idx_width));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(idx_height));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(idx_channel));
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_layout(src->data_layout()));
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClPool2dKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices));
+ return Status{};
+}
+
+void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ unsigned int pool_stride_x = 0;
+ unsigned int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0));
+ auto indices = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_1));
+
+ // Collapse window
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+
+ switch (_data_layout)
+ {
+ case DataLayout::NCHW:
+ {
+ Window slice = window_collapsed.first_slice_window_3D();
+ do
+ {
+ // Set srcs
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src, slice);
+ add_3D_tensor_argument(idx, dst, slice);
+ if (indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2)))
+ {
+ add_3D_tensor_argument(idx, indices, slice);
+ }
+ enqueue(queue, *this, slice, lws_hint());
+ } while (window_collapsed.slide_window_slice_3D(slice));
+ break;
+ }
+ case DataLayout::NHWC:
+ {
+ const size_t batch_size = dst->info()->tensor_shape().total_size_upper(3);
+
+ Window slice = window_collapsed.first_slice_window_4D();
+ Window in_slice = window_collapsed.first_slice_window_4D();
+ in_slice.set(Window::DimX,
+ Window::Dimension(0, src->info()->dimension(0), _num_elems_processed_per_iteration));
+ in_slice.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), pool_stride_x));
+ in_slice.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y));
+ in_slice.set(3, Window::Dimension(0, batch_size, 1));
+ do
+ {
+ // Set srcs
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, src, in_slice);
+ add_4D_tensor_argument(idx, dst, slice);
+ if (indices && is_data_type_float(src->info()->data_type()) &&
+ (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2)))
+ {
+ add_4D_tensor_argument(idx, indices, slice);
+ }
+ enqueue(queue, *this, slice, lws_hint());
+ } while (window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice));
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClPool2dKernel.h b/src/gpu/cl/kernels/ClPool2dKernel.h
new file mode 100644
index 0000000000..56b95a37d5
--- /dev/null
+++ b/src/gpu/cl/kernels/ClPool2dKernel.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_POOL2D_KERNEL_H
+#define ARM_COMPUTE_CL_POOL2D_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the pooling layer kernel */
+class ClPool2dKernel : public IClKernel
+{
+public:
+ ClPool2dKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPool2dKernel);
+
+ /** Configure kernel for a given list of arguments
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+ * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ ITensorInfo *indices = nullptr);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClPool2dKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices = nullptr);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+public:
+ PoolingLayerInfo _pool_info{};
+ DataLayout _data_layout{DataLayout::UNKNOWN};
+ unsigned int _num_elems_processed_per_iteration{1};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_POOL2D_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClPool3dKernel.cpp b/src/gpu/cl/kernels/ClPool3dKernel.cpp
new file mode 100644
index 0000000000..a08c5d4be7
--- /dev/null
+++ b/src/gpu/cl/kernels/ClPool3dKernel.cpp
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClPool3dKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "utils/TypePrinter.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (pool_info.stride.x() == 0 || pool_info.stride.y() == 0 || pool_info.stride.z() == 0),
+ "Strides cannot be zero.");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8_SIGNED,
+ DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) &&
+ (!pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG)),
+ "Exclude padding is unsupported for non-float types for Avg op");
+
+ const auto data_layout = src->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::DEPTH);
+ const bool is_global_pooling = pool_info.is_global_pooling;
+ const unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
+ const unsigned int pool_size_y = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
+ const unsigned int pool_size_z = is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth;
+ int output_width = 0;
+ int output_height = 0;
+ int output_depth = 0;
+
+ bool round_type_ceil_with_asymm_padding =
+ (pool_info.round_type == DimensionRoundingType::CEIL) && (!is_symmetric(pool_info.padding));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(round_type_ceil_with_asymm_padding,
+ "Cannot use dimension round type CEIL when padding is asymmetric.");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info),
+ "Pooling region that is entirely outside input tensor is unsupported");
+ std::tie(output_width, output_height, output_depth) =
+ scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height],
+ src->tensor_shape()[idx_depth], pool_size_x, pool_size_y, pool_size_z, pool_info);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1),
+ "Calculated output dimension size is invalid");
+ // Checks performed when dst is configured
+ if (dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+ TensorInfo out_info(TensorInfo(compute_pool3d_shape(src->tensor_shape(), pool_info), 1, dst->data_type()));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info);
+ }
+
+ return Status{};
+}
+} // namespace
+
+ClPool3dKernel::ClPool3dKernel()
+{
+ _type = CLKernelType::POOL;
+}
+
+void ClPool3dKernel::configure(const ClCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const Pooling3dLayerInfo &pool_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info));
+ auto padding_info = get_padding_info({src, dst});
+
+ // Auto init if empty
+ TensorShape out_shape = compute_pool3d_shape(src->tensor_shape(), pool_info);
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(out_shape));
+
+ // Set instance variables
+ _pool_info = pool_info;
+ _data_layout = src->data_layout();
+
+ _num_elems_processed_per_iteration = (dst->data_type() == DataType::F32) ? 2 : 4;
+ _num_elems_processed_per_iteration = adjust_vec_size(_num_elems_processed_per_iteration, dst->dimension(0));
+
+ const PoolingType pool_type = pool_info.pool_type;
+ const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_depth = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::DEPTH);
+ const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+ const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES);
+ const int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
+ const int pool_size_z = pool_info.is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth;
+ const bool exclude_padding = pool_info.exclude_padding;
+ const int pool_stride_x = pool_info.stride.x();
+ const int pool_stride_y = pool_info.stride.y();
+ const int pool_stride_z = pool_info.stride.z();
+ const int pool_pad_top = pool_info.padding.top;
+ const int pool_pad_left = pool_info.padding.left;
+ const int pool_pad_front = pool_info.padding.front;
+ const DataType data_type = src->data_type();
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type));
+ build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x));
+ build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y));
+ build_opts.add_option("-DSTRIDE_Z=" + support::cpp11::to_string(pool_stride_z));
+ build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_left));
+ build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_top));
+ build_opts.add_option("-DPAD_Z=" + support::cpp11::to_string(pool_pad_front));
+ build_opts.add_option("-DPOOL_SIZE_X=" + support::cpp11::to_string(pool_size_x));
+ build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y));
+ build_opts.add_option("-DPOOL_SIZE_Z=" + support::cpp11::to_string(pool_size_z));
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width)));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height)));
+ build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(idx_depth)));
+
+ // If the data type is quantized, add the relevant parameters
+ if (is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info())
+ {
+ const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
+
+ build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
+ build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
+ build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale));
+ build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
+ }
+
+ // Set the initial value for the pooling operation according to the data type
+ if (pool_type == PoolingType::MAX)
+ {
+ if (is_data_type_quantized(data_type))
+ {
+ PixelValue type_min{};
+ std::tie(type_min, std::ignore) = get_min_max(data_type);
+ build_opts.add_option("-DINITIAL_VALUE=" + support::cpp11::to_string(type_min.get<int32_t>()));
+ }
+ else
+ {
+ build_opts.add_option("-DINITIAL_VALUE=" +
+ float_to_string_with_full_precision(std::numeric_limits<float>::lowest()));
+ }
+ }
+ else
+ {
+ // Pool AVG and Pool L2 initial value
+ build_opts.add_option("-DINITIAL_VALUE=0");
+ }
+ // Create kernel
+ // Floating point mixed precision is supported for F16 only
+ const auto use_fp_mixed_precision =
+ (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX;
+
+ // Wider accumulation is required to avoid accuracy loss
+ // Case 1: Floating point mixed precision (fp16 src data and fp32 accumulation)
+ DataType acc_data_type = data_type;
+ if (use_fp_mixed_precision)
+ {
+ acc_data_type = DataType::F32;
+ }
+ else if (is_data_type_quantized(data_type) &&
+ pool_type != PoolingType::MAX) // Use S32 for avg pooling to allow for integer division
+ {
+ acc_data_type = DataType::S32;
+ }
+
+ build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(acc_data_type));
+ build_opts.add_option_if(use_fp_mixed_precision, "-DFP_MIXED_PRECISION");
+ build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
+ build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height)));
+ build_opts.add_option("-DDST_DEPTH=" + support::cpp11::to_string(dst->dimension(idx_depth)));
+ build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel)));
+ build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size)));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration));
+
+ // If the data type is quantized, use the quantized kernel variant
+ std::string kernel_name = (is_data_type_quantized_asymmetric(data_type) ? "pooling_3d_layer_MxN_ndhwc_quantized"
+ : "pooling_3d_layer_MxN_ndhwc");
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*dst, Steps(_num_elems_processed_per_iteration));
+ ICLKernel::configure_internal(win);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "pooling_layer_3d";
+ _config_id += lower_string(string_from_data_type(data_type));
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_layout(_data_layout));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(idx_width));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(idx_height));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(idx_channel));
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_layout(src->data_layout()));
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClPool3dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info));
+ return Status{};
+}
+
+void ClPool3dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0));
+
+ // Collapse 3D window
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+
+ // Set CL kernel arguments
+ unsigned int idx = 0;
+ // Passing of the window not needed, as the steps are not used for the pool3d kernel
+ add_5D_tensor_argument(idx, src, window);
+ add_5D_tensor_argument(idx, dst, window);
+ enqueue(queue, *this, window_collapsed, lws_hint());
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClPool3dKernel.h b/src/gpu/cl/kernels/ClPool3dKernel.h
new file mode 100644
index 0000000000..6cd229c427
--- /dev/null
+++ b/src/gpu/cl/kernels/ClPool3dKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_POOL3D_KERNEL_H
+#define ARM_COMPUTE_CL_POOL3D_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the pooling layer kernel */
+class ClPool3dKernel : public IClKernel
+{
+public:
+ ClPool3dKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPool3dKernel);
+
+ /** Configure kernel for a given list of arguments
+ *
+ * @note Asymmetric padding is not supported when dimension rounding type == CEIL.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ * @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo.
+ */
+ void configure(const ClCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const Pooling3dLayerInfo &pool_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClPool3dKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ Pooling3dLayerInfo _pool_info{};
+ DataLayout _data_layout{DataLayout::UNKNOWN};
+ unsigned int _num_elems_processed_per_iteration{1};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_POOL3D_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClQuantizeKernel.cpp b/src/gpu/cl/kernels/ClQuantizeKernel.cpp
new file mode 100644
index 0000000000..e8df420f67
--- /dev/null
+++ b/src/gpu/cl/kernels/ClQuantizeKernel.cpp
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClQuantizeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+
+ // Output must always be initialized
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QASYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+
+ return Status{};
+}
+} // namespace
+
+ClQuantizeKernel::ClQuantizeKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ auto padding_info = get_padding_info({src, dst});
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
+
+ const int vec_size_x = 16 / src->element_size();
+ const int input_width_x = src->tensor_shape().x();
+ const bool multi_access_x = (input_width_x / vec_size_x > 0);
+
+ const UniformQuantizationInfo qinfo = dst->quantization_info().uniform();
+ const DataType output_data_type = dst->data_type();
+
+ float scale_to_apply = qinfo.scale;
+ int32_t offset_to_apply = qinfo.offset;
+ if (is_data_type_quantized_asymmetric(src->data_type()))
+ {
+ /*
+ * In case of requantization of a quantized input tensor to an output tensor with another quantization,
+ * instead of applying a dequantization followed by a quantization, we just compute the new scale and
+ * offset to apply.
+ *
+ * Assuming:
+ * - q_i as input quantized value
+ * - q_o as output quantized value
+ * - z_i as input quantization offset value
+ * - z_o as output quantization offset value
+ * - s_i as input quantization scale value
+ * - s_o as output quantization scale value
+ * - z_n as new quantization offset value
+ * - s_n as new quantization scale value
+ *
+ * q_o = ( q_i - z_i ) * s_i / s_o + z_o
+ *
+ * We can rewrite the formula as:
+ *
+ * q_o = ( q_i * s_i / s_o ) - z_i * s_i / s_o + z_o
+ *
+ * q_o = q_i / s_n + z_n
+ *
+ * Where:
+ *
+ * s_n = s_o / s_i
+ *
+ * z_n = - z_i * s_i / s_o + z_o
+ *
+ */
+ const UniformQuantizationInfo qinfo_in = src->quantization_info().uniform();
+ scale_to_apply /= qinfo_in.scale;
+ // In order to minimize flooring we convert the offset to a float,
+ // then compute the new offset in the float domain,
+ // finally we convert it back to int32_t
+ offset_to_apply -= static_cast<int32_t>(static_cast<float>(qinfo_in.offset) * qinfo_in.scale / qinfo.scale);
+ }
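+ // Numerical check of the rewrite above (illustrative example, not part of the kernel logic):
+ // with s_i = 0.5, z_i = 10, s_o = 0.25, z_o = 0 we get s_n = s_o / s_i = 0.5 and
+ // z_n = -z_i * s_i / s_o + z_o = -20. For q_i = 12 the two-step path gives
+ // ((12 - 10) * 0.5) / 0.25 + 0 = 4 and the single-step path gives 12 / 0.5 - 20 = 4, as expected.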
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option_if(is_data_type_float(src->data_type()), "-DIS_FLOAT");
+ build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_to_apply));
+ build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_to_apply));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type()));
+ build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output_data_type));
+ build_opts.add_option_if(
+ multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0)));
+ std::pair<int, int> min_max_quant_values =
+ quantization::get_min_max_values_from_quantized_data_type(output_data_type);
+ build_opts.add_option("-DMIN_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.first));
+ build_opts.add_option("-DMAX_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.second));
+
+ _kernel = create_kernel(compile_context, "quantization_layer", build_opts.options());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*src, Steps());
+ if (multi_access_x)
+ {
+ win.set(Window::DimX,
+ Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+ }
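+ // Example of the vectorised access set up above (illustrative): for an F32 source, vec_size_x is
+ // 16 / 4 = 4 elements; with input_width_x = 10 this enables multi_access_x, defines
+ // -DLAST_ACCESSED_X=max(10 - 4, 0)=6 and rounds the X window end up from 10 to 12, presumably so the
+ // OpenCL kernel can clamp its final vector load back to x = 6 instead of reading out of bounds.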
+ ICLKernel::configure_internal(win);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
+ return Status{};
+}
+
+void ClQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3);
+ Window slice = window_collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src, slice);
+ add_3D_tensor_argument(idx, dst, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (window_collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClQuantizeKernel.h b/src/gpu/cl/kernels/ClQuantizeKernel.h
new file mode 100644
index 0000000000..aeab28febe
--- /dev/null
+++ b/src/gpu/cl/kernels/ClQuantizeKernel.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_QUANTIZE_KERNEL_H
+#define ARM_COMPUTE_CL_QUANTIZE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the quantization layer kernel.
+ *
+ * @note The implementation supports only 3D input tensors.
+ */
+class ClQuantizeKernel : public IClKernel
+{
+public:
+ ClQuantizeKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClQuantizeKernel);
+ /** Set the input, output.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
+ * @param[out] dst Destination tensor info with the same dimensions as the input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
+ *
+ * @note Output auto initialization is not supported by this kernel
+ */
+ void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClQuantizeKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_QUANTIZE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClReshapeKernel.cpp b/src/gpu/cl/kernels/ClReshapeKernel.cpp
new file mode 100644
index 0000000000..53889f3a6b
--- /dev/null
+++ b/src/gpu/cl/kernels/ClReshapeKernel.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClReshapeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+
+#include <string>
+
+/** [ClReshapeKernel Kernel] **/
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+
+ if (dst->tensor_shape().total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() != dst->tensor_shape().total_size());
+ }
+
+ return Status{};
+}
+} // namespace
+
+ClReshapeKernel::ClReshapeKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClReshapeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
+
+ auto padding_info = get_padding_info({src, dst});
+
+ // Create kernel
+ std::set<std::string> build_opts = {"-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())};
+ _kernel = create_kernel(compile_context, "reshape_layer", build_opts);
+
+ // Add static arguments
+ const cl_int2 src_shape = {
+ {static_cast<cl_int>(src->tensor_shape()[0]), static_cast<cl_int>(src->tensor_shape()[1])}};
+ const cl_int2 dst_shape = {
+ {static_cast<cl_int>(dst->tensor_shape()[0]), static_cast<cl_int>(dst->tensor_shape()[1])}};
+ unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters
+ _kernel.setArg<cl_int2>(idx++, src_shape);
+ _kernel.setArg<cl_int2>(idx++, dst_shape);
+
+ // Configure kernel window
+ Window win = calculate_max_window(*dst);
+ ICLKernel::configure_internal(win);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
+
+ return Status{};
+}
+
+void ClReshapeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = window_collapsed.first_slice_window_3D();
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ // Set srcs
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src, window_collapsed);
+ add_3D_tensor_argument(idx, dst, window_collapsed);
+ enqueue(queue, *this, slice, lws_hint());
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+/** [ClReshapeKernel Kernel] **/
diff --git a/src/gpu/cl/kernels/ClReshapeKernel.h b/src/gpu/cl/kernels/ClReshapeKernel.h
new file mode 100644
index 0000000000..95eae82086
--- /dev/null
+++ b/src/gpu/cl/kernels/ClReshapeKernel.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_RESHAPE_KERNEL_H
+#define ARM_COMPUTE_CL_RESHAPE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the kernel to perform tensor reshaping */
+class ClReshapeKernel : public IClKernel
+{
+public:
+ ClReshapeKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClReshapeKernel);
+ /** Set the src and dst of the kernel
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data type supported: All.
+ * @param[out] dst Destination tensor info. Data type supported: Same as @p src
+ */
+ void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClReshapeKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_RESHAPE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClScaleKernel.cpp b/src/gpu/cl/kernels/ClScaleKernel.cpp
new file mode 100644
index 0000000000..4305acad26
--- /dev/null
+++ b/src/gpu/cl/kernels/ClScaleKernel.cpp
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClScaleKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/InterpolationPolicyUtils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/ScaleUtils.h"
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+inline std::tuple<float, float>
+calculate_scale_factors(const ITensorInfo *src, const ITensorInfo *dst, DataLayout data_layout, bool align_corners)
+{
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ // Compute the ratio between source width/height and destination width/height
+ const unsigned int src_width = src->dimension(idx_width);
+ const unsigned int src_height = src->dimension(idx_height);
+ const unsigned int dst_width = dst->dimension(idx_width);
+ const unsigned int dst_height = dst->dimension(idx_height);
+
+ float scale_x = arm_compute::scale_utils::calculate_resize_ratio(src_width, dst_width, align_corners);
+ float scale_y = arm_compute::scale_utils::calculate_resize_ratio(src_height, dst_height, align_corners);
+
+ return std::make_tuple(scale_x, scale_y);
+}
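+// Illustrative example of the ratios computed above (assuming calculate_resize_ratio returns the
+// source/destination size ratio, and (src - 1) / (dst - 1) when align_corners is set): resizing a
+// 10x10 plane to 20x20 gives scale_x = scale_y = 0.5, i.e. up-sampling. This is why validate_arguments()
+// rejects the AREA policy when either scale factor is greater than 1 (down-sampling), and why configure()
+// falls back to NEAREST_NEIGHBOR for the remaining up-sampling case.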
+
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst == src);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ info.align_corners &&
+ !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
+ ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) &&
+ !is_data_type_quantized_asymmetric(src->data_type()));
+
+ float scale_x = 0.f;
+ float scale_y = 0.f;
+ const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
+ std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, data_layout, info.align_corners);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA &&
+ (scale_x > 1.f || scale_y > 1.f));
+
+ return Status{};
+}
+} // namespace
+
+Status ClScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, info));
+ return Status{};
+}
+
+ClScaleKernel::ClScaleKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClScaleKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const ScaleKernelInfo &info)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, info));
+ auto padding_info = get_padding_info({src, dst});
+
+ // Info required for the static tuning
+ _data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
+
+ const bool is_nhwc = _data_layout == DataLayout::NHWC;
+
+ float scale_x = 0.f;
+ float scale_y = 0.f;
+ std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, _data_layout, info.align_corners);
+
+ // Area interpolation behaves as Nearest Neighbour in case of up-sampling
+ auto interpolation_policy_to_use = info.interpolation_policy;
+ if (info.interpolation_policy == InterpolationPolicy::AREA && scale_x <= 1.f && scale_y <= 1.f)
+ {
+ interpolation_policy_to_use = InterpolationPolicy::NEAREST_NEIGHBOR;
+ }
+
+ // Create kernel
+ const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+ const unsigned int src_width = src->dimension(idx_width);
+ const unsigned int src_height = src->dimension(idx_height);
+ const unsigned int dst_width = dst->dimension(idx_width);
+ const unsigned int dst_channels = dst->dimension(idx_channel);
+ unsigned int vec_size = 0;
+ unsigned int vec_size_leftover = 0;
+
+ CLBuildOptions build_opts;
+ if (_data_layout == DataLayout::NHWC)
+ {
+ vec_size = adjust_vec_size(src->data_type() == DataType::F32 ? 4 : 8, dst_channels);
+ vec_size_leftover = dst_channels % vec_size;
+ build_opts.add_option("-DSRC_TENSOR_TYPE=BUFFER");
+ build_opts.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+ build_opts.add_option("-DDST_TENSOR_TYPE=BUFFER");
+ build_opts.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
+ build_opts.add_option("-DCONSTANT_VALUE=" +
+ string_from_pixel_value(info.constant_border_value, src->data_type()));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(vec_size));
+ build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(vec_size_leftover));
+ build_opts.add_option("-DSCALE_" + string_from_interpolation_policy(interpolation_policy_to_use));
+ build_opts.add_option_if(src->num_dimensions() > 3, "-DBATCHED_EXECUTION");
+ build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
+ build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT");
+ build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS");
+ build_opts.add_option_if(is_data_type_float(src->data_type()), "-DIS_FLOATING_POINT");
+ build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER",
+ "-DSAMPLING_POLICY_TOP_LEFT");
+ }
+ else if (_data_layout == DataLayout::NCHW)
+ {
+ vec_size = adjust_vec_size(4, dst_width);
+ vec_size_leftover = dst_width % vec_size;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+ build_opts.add_option("-DCONSTANT_VALUE=" +
+ string_from_pixel_value(info.constant_border_value, src->data_type()));
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height));
+ build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x));
+ build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0)
+ ? support::cpp11::to_string(vec_size)
+ : support::cpp11::to_string(vec_size_leftover)));
+ build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
+ build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT");
+ build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS");
+ build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER",
+ "-DSAMPLING_POLICY_TOP_LEFT");
+
+ const bool is_qasymm_bilinear = is_data_type_quantized_asymmetric(src->data_type()) &&
+ info.interpolation_policy == InterpolationPolicy::BILINEAR;
+ if (is_qasymm_bilinear)
+ {
+ const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
+ build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale));
+ build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported data layout");
+ }
+
+ std::string interpolation_name = string_from_interpolation_policy(interpolation_policy_to_use);
+ std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
+ std::string kernel_name = "scale_" + interpolation_name + "_";
+ kernel_name += lower_string(string_from_data_layout(_data_layout));
+
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*dst, Steps(vec_size));
+ ICLKernel::configure_internal(win);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+
+ // Pass scale kernel arguments
+ if (is_nhwc)
+ {
+ unsigned int idx = 2 * num_arguments_per_4d_tensor_nhwc();
+ _kernel.setArg<cl_float>(idx++, scale_x);
+ _kernel.setArg<cl_float>(idx++, scale_y);
+ }
+ // Set config_id for enabling LWS tuning
+ _config_id = "scale_";
+ _config_id += (info.border_mode == BorderMode::REPLICATE ? "Bord_rep" : "");
+ _config_id += (info.sampling_policy == SamplingPolicy::CENTER ? "center" : "topleft");
+ _config_id += (is_nhwc ? "nhwc" : "nchw");
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(3));
+}
+
+void ClScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ switch (_data_layout)
+ {
+ case DataLayout::NCHW:
+ {
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, src, slice);
+ add_2D_tensor_argument(idx, dst, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (window.slide_window_slice_2D(slice));
+ break;
+ }
+ case DataLayout::NHWC:
+ {
+ Window collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_4D();
+
+ unsigned int idx = 0;
+ add_4d_tensor_nhwc_argument(idx, src);
+ add_4d_tensor_nhwc_argument(idx, dst);
+ enqueue(queue, *this, slice, lws_hint());
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Data layout not supported");
+ }
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClScaleKernel.h b/src/gpu/cl/kernels/ClScaleKernel.h
new file mode 100644
index 0000000000..c09659017d
--- /dev/null
+++ b/src/gpu/cl/kernels/ClScaleKernel.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_SCALE_KERNEL_H
+#define ARM_COMPUTE_CL_SCALE_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the scale kernel */
+class ClScaleKernel : public IClKernel
+{
+public:
+ ClScaleKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClScaleKernel);
+ /** Initialise the kernel's inputs, output and interpolation policy
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
+ * @param[out] dst Destination tensor info. Data types supported: Same as @p src
+ * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+ * @param[in] info @ref ScaleKernelInfo Kernel descriptor to be used to configure.
+ */
+ void
+ configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClScaleKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ DataLayout _data_layout{DataLayout::UNKNOWN};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_SCALE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClScatterKernel.cpp b/src/gpu/cl/kernels/ClScatterKernel.cpp
new file mode 100644
index 0000000000..19adc1ef34
--- /dev/null
+++ b/src/gpu/cl/kernels/ClScatterKernel.cpp
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClScatterKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/DataTypeUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+
+namespace
+{
+constexpr int max_index_length = 5;
+} // namespace
+
+ClScatterKernel::ClScatterKernel()
+{
+}
+
+Status ClScatterKernel::validate(const ITensorInfo *updates,
+ const ITensorInfo *indices,
+ const ITensorInfo *dst,
+ const ScatterInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+
+ const TensorShape &ind_shape = indices->tensor_shape();
+ const TensorShape &upt_shape = updates->tensor_shape();
+ const TensorShape &dst_shape = dst->tensor_shape();
+
+ const int32_t upt_dims = upt_shape.num_dimensions();
+ const int32_t dst_dims = dst_shape.num_dimensions();
+ const int32_t ind_dims = ind_shape.num_dimensions();
+ const int32_t data_dim = upt_dims - (ind_dims - 1); // Number of batch dims is the number of indices dims - 1
+
+ const int32_t index_len = ind_shape[0];
+ bool unsupported_padding_config =
+ (dst_dims == index_len) && index_len > 1 && (dst->has_padding() || updates->has_padding());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(unsupported_padding_config, "Padding is not supported with these shapes.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(updates, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(indices, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(dst, DataType::F32, DataType::F16, DataType::S32, DataType::S16,
+ DataType::S8, DataType::U32, DataType::U16, DataType::U8);
+
+ // Check data dims in update tensor and output tensor are equal
+ for (int32_t i = 0; i < data_dim; i++)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(upt_shape[i] != dst_shape[i],
+                                        "Data dims should be the same size in both updates and output tensors.");
+ }
+
+ // Check if batch dims in indices and updates tensor are equal.
+ for (int32_t i = 0; i < ind_dims - 1; i++)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(upt_shape[data_dim + i] != ind_shape[i + 1],
+ "Batch dimensions should be the same in updates and indices tensor.");
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(ind_shape[1] != upt_shape[data_dim],
+                                    "Height of the indices tensor should match the size of the highest dimension in the updates tensor "
+                                    "(excluding batch dimensions)");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        data_dim >= dst_dims, "Update tensor cannot have more dims than the output tensor (excluding batch dimensions).");
+ ARM_COMPUTE_RETURN_ERROR_ON(index_len != dst_dims - data_dim);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((ind_dims < 2), "Shape of Indices tensor must be at least 2D");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(index_len > max_index_length, "Maximum supported index length is 5!");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(index_len > dst_dims && dst_dims != 1,
+                                    "Index length should be smaller than or equal to the number of output dims");
+
+ return Status{};
+}
+
+void ClScatterKernel::configure(const ClCompileContext &compile_context,
+ const ITensorInfo *updates,
+ const ITensorInfo *indices,
+ ITensorInfo *dst,
+ const ScatterInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(updates, dst, indices);
+ ARM_COMPUTE_LOG_PARAMS(updates, indices, dst, info);
+
+ const TensorShape &dst_shape = dst->tensor_shape();
+ const int index_len = indices->dimension(0);
+
+ // Check for single element data block
+ const bool is_scalar_block = (dst->num_dimensions() == static_cast<uint32_t>(index_len));
+
+ const int n0 = adjust_vec_size(16 / updates->element_size(), is_scalar_block ? 1 : updates->dimension(0));
+ const int partial_n0 = updates->dimension(0) % n0;
+
+ // The GWS will be 2D [x, y]
+ // x-dimension refers to the x coordinate of the dst tensor
+ // y-dimension refers to the collapsed y-coordinate of the data part of the dst tensor
+ Window win;
+
+ if (!is_scalar_block)
+ {
+ win = calculate_max_window(dst_shape, Steps(n0));
+
+ // Collapse the dimensions corresponding to indices in the execution window
+ for (int i = 0; i < index_len; ++i)
+ {
+ win.set(dst->num_dimensions() - (i + 1), Window::Dimension(0, 1, 1));
+ }
+
+ win = win.collapse(win, 1);
+ }
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
+ build_opts.add_option_if(is_data_type_float(dst->data_type()), "-DIS_FLOAT");
+
+ const int num_dims = dst->num_dimensions();
+ TensorShape ind_collapsed = indices->tensor_shape().collapsed_from(1);
+ build_opts.add_option("-DNUM_INDICES=" + support::cpp11::to_string(ind_collapsed[1]));
+ build_opts.add_option("-DINDEX_LENGTH=" + support::cpp11::to_string(index_len));
+
+ // We provide 5 variables to use in a constant array
+ for (int i = 1; i <= max_index_length; i++)
+ {
+ build_opts.add_option("-DOUT_SHAPE_N_MINUS_" + support::cpp11::to_string(i) + "=" +
+ support::cpp11::to_string(dst_shape[std::max(num_dims - i, 0)]));
+ }
+
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+ build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_n0));
+
+ switch (info.func)
+ {
+ case ScatterFunction::Update:
+ build_opts.add_option("-DSCATTER_FUNCTION=UPDATE_OP");
+ build_opts.add_option("-DSKIP_OUTPUT_READ");
+ break;
+ case ScatterFunction::Add:
+ build_opts.add_option("-DSCATTER_FUNCTION=ADD_OP");
+ break;
+ case ScatterFunction::Sub:
+ build_opts.add_option("-DSCATTER_FUNCTION=SUB_OP");
+ break;
+ case ScatterFunction::Max:
+ build_opts.add_option("-DSCATTER_FUNCTION=MAX_OP");
+ break;
+ case ScatterFunction::Min:
+ build_opts.add_option("-DSCATTER_FUNCTION=MIN_OP");
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+
+ // Create kernel
+ std::string kernel_name = "scatter_mp1d_2d_mpnd";
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ ICLKernel::configure_internal(win);
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(updates->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(2));
+ _config_id += "_";
+}
+
+void ClScatterKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ const auto updates =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto indices =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ const ITensorInfo *dst_info = dst->info();
+ const ITensorInfo *upd_info = updates->info();
+ const int num_dims = dst_info->num_dimensions();
+ const int ind_dims = indices->info()->num_dimensions();
+ const int index_len = indices->info()->dimension(0);
+
+ bool unsupported_padding_config =
+ num_dims == index_len && index_len > 1 && (dst_info->has_padding() || upd_info->has_padding());
+ if (unsupported_padding_config)
+ {
+ ARM_COMPUTE_ERROR("Unsupported Configuration! Padding not supported with these shapes.");
+ }
+
+    // Calculate m-dimensional data block strides in updates and destination tensors
+ const int upt_block_stride =
+ updates->info()->strides_in_bytes()[updates->info()->num_dimensions() - (ind_dims - 1)];
+
+ const int out_block_stride = dst_info->strides_in_bytes()[num_dims - index_len];
+
+ unsigned int idx = 0;
+
+ add_2D_tensor_argument(idx, updates, window);
+ add_2D_tensor_argument(idx, indices, window);
+ add_2D_tensor_argument(idx, dst, window);
+
+ _kernel.setArg<cl_int>(idx++, upt_block_stride);
+ _kernel.setArg<cl_int>(idx++, out_block_stride);
+
+ enqueue(queue, *this, window, lws_hint());
+}
+
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
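The dimension bookkeeping in validate() above is easiest to see on concrete numbers. The standalone sketch below re-states those relationships in ACL dimension order (dim 0 fastest); the function name and the example shapes are illustrative only.

#include <cassert>

static void scatter_shape_example()
{
    // dst = [C, W, H], indices = [index_len, N], updates = [C, N]
    const int dst_dims  = 3;
    const int ind_dims  = 2;
    const int upt_dims  = 2;
    const int index_len = 2; // each of the N index rows addresses (W, H)

    const int data_dim = upt_dims - (ind_dims - 1); // 1: one data dimension (C) copied per update
    assert(data_dim < dst_dims);                    // updates cannot out-rank dst (excluding batch dims)
    assert(index_len == dst_dims - data_dim);       // index rows address exactly the remaining dims
    assert(ind_dims >= 2 && index_len <= 5);        // indices at least 2D, max_index_length == 5
}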
diff --git a/src/gpu/cl/kernels/ClScatterKernel.h b/src/gpu/cl/kernels/ClScatterKernel.h
new file mode 100644
index 0000000000..e1b469c88e
--- /dev/null
+++ b/src/gpu/cl/kernels/ClScatterKernel.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLSCATTERKERNEL_H
+#define ACL_SRC_GPU_CL_KERNELS_CLSCATTERKERNEL_H
+
+#include "arm_compute/function_info/ScatterInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+
+class ClScatterKernel : public IClKernel
+{
+public:
+ ClScatterKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClScatterKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @note Negative indices are treated as out of bounds.
+ *
+ * @param[in] compile_context The compile context to be used.
+     * @param[in]  updates         Input tensor info for the Update matrix. Data types supported: same as @p dst
+     * @param[in]  indices         Input tensor info for the Indices matrix. Data type supported: S32.
+     * @param[out] dst             Output tensor info. Data types supported: F32/F16/S32/S16/S8/U32/U16/U8
+ * @param[in] info Attributes for Scatter Kernel
+ */
+ void configure(const ClCompileContext &compile_context,
+ const ITensorInfo *updates,
+ const ITensorInfo *indices,
+ ITensorInfo *dst,
+ const ScatterInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClScatterKernel::configure()
+ *
+ * @return a status
+ */
+ static Status
+ validate(const ITensorInfo *updates, const ITensorInfo *indices, const ITensorInfo *dst, const ScatterInfo &info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+
+#endif // ACL_SRC_GPU_CL_KERNELS_CLSCATTERKERNEL_H
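A minimal sketch of a validity check against the interface above. TensorInfo/TensorShape usage follows the usual ACL constructors; the ScatterInfo constructor taking the scatter function and a zero-initialisation flag is an assumption here, and the function name is hypothetical.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/function_info/ScatterInfo.h"

#include "src/gpu/cl/kernels/ClScatterKernel.h"

using namespace arm_compute;

Status example_scatter_validate()
{
    const TensorInfo updates(TensorShape(16U, 4U), 1, DataType::F32); // [C, N]
    const TensorInfo indices(TensorShape(2U, 4U), 1, DataType::S32);  // [index_len, N]
    const TensorInfo dst(TensorShape(16U, 8U, 8U), 1, DataType::F32); // [C, W, H]

    const ScatterInfo info(ScatterFunction::Add, /* zero_initialization */ false); // assumed ctor
    return opencl::kernels::ClScatterKernel::validate(&updates, &indices, &dst, info);
}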
diff --git a/src/gpu/cl/kernels/ClSoftmaxKernel.cpp b/src/gpu/cl/kernels/ClSoftmaxKernel.cpp
new file mode 100644
index 0000000000..796345a923
--- /dev/null
+++ b/src/gpu/cl/kernels/ClSoftmaxKernel.cpp
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClSoftmaxKernel.h"
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/CoreTypes.h"
+#include "arm_compute/core/Dimensions.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Steps.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/DataTypeUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+#include <string>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+
+ClSoftmaxKernel::ClSoftmaxKernel()
+{
+}
+
+Status ClSoftmaxKernel::validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info)
+{
+ ARM_COMPUTE_UNUSED(src, dst, info);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(src.num_dimensions() > 4);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN( //
+ &src, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(info.input_data_type != src.data_type());
+ ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast<int32_t>(-src.num_dimensions()) ||
+ static_cast<int32_t>(src.num_dimensions()) <= info.axis);
+
+ if (is_data_type_quantized_asymmetric(src.data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(src.quantization_info().uniform().scale < 0);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() !=
+ get_softmax_output_quantization_info(src.data_type(), info.is_log));
+ }
+
+ return Status{};
+}
+
+void ClSoftmaxKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo &src,
+ ITensorInfo &dst,
+ const SoftmaxKernelInfo &info)
+{
+ ARM_COMPUTE_UNUSED(compile_context, src, dst, info);
+
+ const auto &dst_shape = dst.tensor_shape();
+
+ const auto data_type = src.data_type();
+ const auto element_size = src.element_size();
+
+ const auto is_quantized = data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED;
+ const auto src_qinfo = src.quantization_info().uniform();
+ const auto dst_qinfo = dst.quantization_info().uniform();
+
+ const auto axis = wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions()));
+ const auto length = dst_shape[axis];
+
+ const auto tmp_data_type = is_quantized ? DataType::F32 : data_type;
+
+ const auto vec_size = adjust_vec_size(16 / element_size, dst_shape[0]);
+ const auto vec_size_leftover = dst_shape[0] % vec_size;
+
+ std::string kernel_name("softmax");
+ CLBuildOptions build_opts;
+
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DTMP_DATA_TYPE=" + get_cl_type_from_data_type(tmp_data_type));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
+ build_opts.add_option("-DLENGTH=" + support::cpp11::to_string(length));
+ build_opts.add_option_if(info.is_log, "-DIS_LOG");
+ build_opts.add_option("-DBETA=" + float_to_string_with_full_precision(info.beta));
+
+ build_opts.add_option_if(is_quantized, "-DIS_QUANTIZED");
+ build_opts.add_option_if(is_quantized, "-DSRC_OFFSET=" + float_to_string_with_full_precision(src_qinfo.offset));
+ build_opts.add_option_if(is_quantized, "-DSRC_SCALE=" + float_to_string_with_full_precision(src_qinfo.scale));
+ build_opts.add_option_if(is_quantized, "-DDST_OFFSET=" + float_to_string_with_full_precision(dst_qinfo.offset));
+ build_opts.add_option_if(is_quantized, "-DDST_SCALE=" + float_to_string_with_full_precision(dst_qinfo.scale));
+
+ if (axis == 0)
+ {
+ kernel_name += "_x";
+ build_opts.add_option("-DSOFTMAX_X");
+
+ if (is_quantized)
+ {
+ _tmp_info = TensorInfo(dst_shape, 1, tmp_data_type);
+ }
+ }
+ else
+ {
+ kernel_name += "_non_x";
+ build_opts.add_option("-DSOFTMAX_NON_X");
+
+ TensorShape tmp_shape;
+
+ tmp_shape.set(0, length * vec_size, false);
+ tmp_shape.set(1, dst_shape[0] + (vec_size - vec_size_leftover) % vec_size, false);
+
+ for (size_t i = 2; i <= static_cast<size_t>(axis); ++i)
+ {
+ tmp_shape.set(i, dst_shape[i - 1], false);
+ }
+
+ for (size_t i = axis + 1; i < dst_shape.num_dimensions(); ++i)
+ {
+ tmp_shape.set(i, dst_shape[i], false);
+ }
+
+ _tmp_info = TensorInfo(tmp_shape, 1, tmp_data_type);
+ }
+
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Configure kernel window and kernel arguments.
+ Window win = calculate_max_window(src, Steps(vec_size));
+
+ bool has_collapsed = true;
+
+ win = win.shift_dimensions(1, axis); // Remove this axis from the window/GWS.
+ win = win.collapse_if_possible(win, 2, has_collapsed);
+ ARM_COMPUTE_ERROR_ON(!has_collapsed);
+
+ ICLKernel::configure_internal(win);
+
+ _axis = axis;
+
+ _config_id = "softmax_" + lower_string(string_from_data_type(data_type));
+ _config_id += "_" + std::to_string(axis);
+ _config_id += "_" + std::to_string(length);
+}
+
+void ClSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ ICLTensor *tmp = (_tmp_info.total_size() > 0)
+ ? utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_INT_0))
+ : nullptr;
+
+ if (!_prepared)
+ {
+ _prepared = true;
+
+ const auto *src_info = src->info();
+ const auto *dst_info = dst->info();
+ auto src_strides = src_info->strides_in_bytes();
+ auto dst_strides = dst_info->strides_in_bytes();
+
+ const auto src_stride_axis = src_strides[_axis];
+ const auto dst_stride_axis = dst_strides[_axis];
+
+ // This axis has been removed from execution window, hence we remove it from the list of strides
+ // provided to the kernel.
+ // In case axis > 0, src/dst_stride_axis will be provided in dedicated argument independent from global ID.
+ src_strides.remove(_axis);
+ dst_strides.remove(_axis);
+
+ // Argument 0: src_ptr.
+ _kernel.setArg<cl_uint>(1, src_strides[0]);
+ _kernel.setArg<cl_uint>(2, src_strides[1]);
+ _kernel.setArg<cl_uint>(3, src_strides[2]);
+ _kernel.setArg<cl_uint>(4, src_info->offset_first_element_in_bytes());
+
+ // Argument 5: dst_ptr.
+ _kernel.setArg<cl_uint>(6, dst_strides[0]);
+ _kernel.setArg<cl_uint>(7, dst_strides[1]);
+ _kernel.setArg<cl_uint>(8, dst_strides[2]);
+ _kernel.setArg<cl_uint>(9, dst_info->offset_first_element_in_bytes());
+
+ if (tmp != nullptr)
+ {
+ const auto *tmp_info = tmp->info();
+ const auto &tmp_strides = tmp_info->strides_in_bytes();
+
+ // Argument 10: tmp_ptr.
+ _kernel.setArg<cl_uint>(11, tmp_strides[1]);
+ _kernel.setArg<cl_uint>(12, tmp_strides[2]);
+ _kernel.setArg<cl_uint>(13, tmp_strides[3]);
+ _kernel.setArg<cl_uint>(14, 0);
+ }
+
+ if (_axis > 0)
+ {
+ _kernel.setArg<cl_uint>(15, src_stride_axis);
+ _kernel.setArg<cl_uint>(16, dst_stride_axis);
+ }
+ }
+
+ _kernel.setArg(0, src->cl_buffer());
+ _kernel.setArg(5, dst->cl_buffer());
+
+ if (tmp != nullptr)
+ {
+ _kernel.setArg(10, tmp->cl_buffer());
+ }
+
+ enqueue(queue, *this, window, lws_hint());
+}
+
+const TensorInfo &ClSoftmaxKernel::tmp_tensor_info() const
+{
+ return _tmp_info;
+}
+
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
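Two small pieces of arithmetic drive the configuration above: the requested axis is wrapped into [0, num_dims) and the vector size is derived from the innermost dimension. A standalone, simplified sketch follows; the helper names are hypothetical, and the real adjust_vec_size() additionally rounds to a supported vector width.

#include <cstddef>

// Negative axes count from the back, e.g. axis = -1 on a 3D tensor wraps to 2.
static std::size_t wrap_softmax_axis(int axis, std::size_t num_dims)
{
    return static_cast<std::size_t>(axis < 0 ? axis + static_cast<int>(num_dims) : axis);
}

// Preferred vector width is 16 bytes worth of elements, clamped to the innermost dimension.
// For F32 (4 bytes) and dim0 = 33: width 4, leftover 33 % 4 = 1.
static std::size_t softmax_vec_size(std::size_t element_size, std::size_t dim0)
{
    const std::size_t preferred = 16 / element_size;
    return preferred < dim0 ? preferred : dim0;
}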
diff --git a/src/gpu/cl/kernels/ClSoftmaxKernel.h b/src/gpu/cl/kernels/ClSoftmaxKernel.h
new file mode 100644
index 0000000000..130dc7835c
--- /dev/null
+++ b/src/gpu/cl/kernels/ClSoftmaxKernel.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLSOFTMAXKERNEL_H
+#define ACL_SRC_GPU_CL_KERNELS_CLSOFTMAXKERNEL_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+
+/** The CL kernel that performs softmax function. */
+class ClSoftmaxKernel : public IClKernel
+{
+public:
+ ClSoftmaxKernel();
+
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClSoftmaxKernel);
+
+ /** Check if the kernel arguments are valid.
+ *
+ * See @ref ClSoftmaxKernel::configure().
+ *
+ * @return The status.
+ */
+ static Status validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info);
+
+ /** Configure the kernel.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src
+ * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo &src,
+ ITensorInfo &dst,
+ const SoftmaxKernelInfo &info);
+
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+ /** Get the tensor info of the temporary tensor. */
+ const TensorInfo &tmp_tensor_info() const;
+
+private:
+ bool _prepared{false};
+ int32_t _axis{0};
+ TensorInfo _tmp_info{};
+};
+
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_KERNELS_CLSOFTMAXKERNEL_H
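A minimal sketch exercising the interface above, assuming SoftmaxKernelInfo exposes the beta/is_log/input_data_type/axis fields consumed by configure(); the function name is hypothetical and the shapes are arbitrary.

#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"

#include "src/gpu/cl/kernels/ClSoftmaxKernel.h"

using namespace arm_compute;

Status example_softmax_validate()
{
    const TensorInfo src(TensorShape(33U, 24U, 3U), 1, DataType::F32);
    const TensorInfo dst(TensorShape(33U, 24U, 3U), 1, DataType::F32);

    SoftmaxKernelInfo info{};
    info.beta            = 1.0f;
    info.is_log          = false;
    info.input_data_type = src.data_type();
    info.axis            = 0; // softmax along the innermost dimension

    return opencl::kernels::ClSoftmaxKernel::validate(src, dst, info);
}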
diff --git a/src/gpu/cl/kernels/ClTransposeKernel.cpp b/src/gpu/cl/kernels/ClTransposeKernel.cpp
new file mode 100644
index 0000000000..f95a215107
--- /dev/null
+++ b/src/gpu/cl/kernels/ClTransposeKernel.cpp
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClTransposeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+ClTransposeKernel::ClTransposeKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClTransposeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ // Output auto initialization if not yet initialized
+ const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src);
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
+
+ // Explicitly set the tensor shape to preserve dimensions
+ dst->set_tensor_shape(dst_shape);
+
+ ARM_COMPUTE_ERROR_THROW_ON(ClTransposeKernel::validate(src, dst));
+ auto padding_info = get_padding_info({src, dst});
+
+ unsigned int vec_size_x;
+ unsigned int vec_size_y;
+
+ // Set the optimal tile size for each data type without register spilling
+ switch (src->element_size())
+ {
+ case 1:
+ vec_size_x = adjust_vec_size(8, src->dimension(0));
+ vec_size_y = adjust_vec_size(16, src->dimension(1));
+ break;
+ case 2:
+ vec_size_x = adjust_vec_size(8, src->dimension(0));
+ vec_size_y = adjust_vec_size(8, src->dimension(1));
+ break;
+ case 4:
+ vec_size_x = adjust_vec_size(4, src->dimension(0));
+ vec_size_y = adjust_vec_size(8, src->dimension(1));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type");
+ break;
+ }
+
+ const int vec_size_x_leftovers = src->dimension(0) % vec_size_x;
+ const int vec_size_y_leftovers = src->dimension(1) % vec_size_y;
+
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE_IN_BYTES=" + support::cpp11::to_string(src->element_size()));
+ build_opts.add_option("-DVEC_SIZE_X=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER_X=" + support::cpp11::to_string(vec_size_x_leftovers));
+ build_opts.add_option("-DVEC_SIZE_Y=" + support::cpp11::to_string(vec_size_y));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER_Y=" + support::cpp11::to_string(vec_size_y_leftovers));
+
+ _kernel = create_kernel(compile_context, "transpose", build_opts.options());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*src, Steps(vec_size_x, vec_size_y));
+ ICLKernel::configure_internal(win);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+
+ // Validate configured dst
+ if (dst->total_size() != 0)
+ {
+ const TensorInfo dst_info =
+ src->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*src));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &dst_info);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ }
+
+ return Status{};
+}
+
+void ClTransposeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ // Collapse dimensions higher than width and height into the batch dimension
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src, slice);
+ add_3D_tensor_argument(idx, dst, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClTransposeKernel.h b/src/gpu/cl/kernels/ClTransposeKernel.h
new file mode 100644
index 0000000000..eaad38b20f
--- /dev/null
+++ b/src/gpu/cl/kernels/ClTransposeKernel.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLTRANSPOSEKERNEL_H
+#define ACL_SRC_GPU_CL_KERNELS_CLTRANSPOSEKERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to transpose a tensor. Only the first two dimensions (width, height) are transposed. */
+class ClTransposeKernel : public IClKernel
+{
+public:
+ ClTransposeKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClTransposeKernel);
+ /** Set the src and dst of the kernel.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src The src tensor info. Data types supported: All.
+ * @param[in] dst The dst tensor info. Data types supported: Same as @p src
+ */
+ void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClTransposeKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_KERNELS_CLTRANSPOSEKERNEL_H
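A minimal sketch of the shape contract above: only width and height swap, every other dimension is preserved. Illustrative only; the function name is hypothetical.

#include "arm_compute/core/TensorInfo.h"

#include "src/gpu/cl/kernels/ClTransposeKernel.h"

using namespace arm_compute;

Status example_transpose_validate()
{
    const TensorInfo src(TensorShape(33U, 24U, 3U), 1, DataType::F32);
    const TensorInfo dst(TensorShape(24U, 33U, 3U), 1, DataType::F32); // W and H swapped, C untouched
    return opencl::kernels::ClTransposeKernel::validate(&src, &dst);
}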
diff --git a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp
new file mode 100644
index 0000000000..76f39ac500
--- /dev/null
+++ b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClTransposedConvolutionKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &deconv_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32,
+ DataType::QASYMM8_SIGNED, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(weights, DataLayout::NHWC);
+
+ constexpr unsigned int channel_idx = 0;
+ constexpr unsigned int width_idx = 1;
+ constexpr unsigned int height_idx = 2;
+ constexpr unsigned int batch_idx = 3;
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != input->dimension(channel_idx),
+                                    "Weights feature map dimension should match the respective dimension of the input tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
+
+ if (biases != nullptr)
+ {
+ if (is_data_type_quantized_asymmetric(input->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(channel_idx) != weights->dimension(batch_idx),
+ "Biases size and number of dst feature maps should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, "Biases should be one dimensional");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
+ }
+
+ // Checks performed when output is configured
+ if (output->total_size() != 0)
+ {
+ const size_t input_width = input->dimension(width_idx);
+ const size_t input_height = input->dimension(height_idx);
+ const size_t weights_width = weights->dimension(width_idx);
+ const size_t weights_height = weights->dimension(height_idx);
+
+ auto out_dims =
+ deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info);
+ TensorShape output_shape =
+ misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(output, DataLayout::NHWC);
+ }
+
+ return Status{};
+}
+} // namespace
+
+void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *output,
+ const PadStrideInfo &deconv_info)
+{
+ ARM_COMPUTE_UNUSED(biases, deconv_info);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ // Perform validation
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input, weights, biases, output, deconv_info));
+
+ constexpr unsigned int channel_idx = 0;
+ constexpr unsigned int width_idx = 1;
+ constexpr unsigned int height_idx = 2;
+
+ const size_t input_channels = input->dimension(channel_idx); // same as weight channels
+ const size_t input_width = input->dimension(width_idx);
+ const size_t input_height = input->dimension(height_idx);
+ const size_t weights_width = weights->dimension(width_idx);
+ const size_t weights_height = weights->dimension(height_idx);
+ const size_t output_width = output->dimension(width_idx);
+ const size_t output_height = output->dimension(height_idx);
+ const size_t output_channels = output->dimension(channel_idx);
+
+ // Calculate output shape
+ auto out_dims =
+ deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info);
+ TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
+ auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->quantization_info());
+
+ // Calculate updated paddings
+ // p' = k - p - 1 (k: kernel dimensions)
+ const uint32_t pad_left = weights_width - deconv_info.pad_left() - 1;
+ const uint32_t pad_top = weights_height - deconv_info.pad_top() - 1;
+
+ // Configure kernel window
+ Window win;
+ output_shape.collapse(2U, 1U); // Collapse width and height into single dimension
+
+ const unsigned int n0 = adjust_vec_size(16 / output->element_size(), output_channels);
+ const unsigned int m0 = 1;
+ const unsigned int k0 = adjust_vec_size(16 / input->element_size(), input_channels);
+ const unsigned int partial_store_n0 = output_channels % n0;
+
+ // Create window and update padding
+ win = calculate_max_window(output_shape, Steps(n0, m0));
+ ICLKernel::configure_internal(win);
+
+ const std::string kernel_name = "transposed_convolution_nhwc";
+ CLBuildOptions build_options;
+
+ const DataType input_data_type = input->data_type();
+ const PaddingInfo strides = deconv_info.stride();
+
+ if (biases != nullptr)
+ {
+ build_options.add_option(std::string("-DHAS_BIAS"));
+ build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type())));
+ }
+
+ const auto output_data_type = output->data_type();
+
+ build_options.add_option("-cl-fast-relaxed-math");
+ build_options.add_option("-DSRC_TENSOR_TYPE=BUFFER");
+ build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(input_data_type));
+ build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(input_channels));
+ build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width));
+ build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height));
+ build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(output_channels));
+ build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(output_width));
+ build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(output_height));
+ build_options.add_option("-DDST_TENSOR_TYPE=BUFFER");
+ build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(output_data_type));
+ build_options.add_option("-DWEI_TENSOR_TYPE=BUFFER");
+ build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights_width));
+ build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights_height));
+ build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(weights->data_type()));
+ build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(strides.first));
+ build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(strides.second));
+ build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(pad_left));
+ build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(pad_top));
+ build_options.add_option("-DN0=" + support::cpp11::to_string(n0));
+ build_options.add_option("-DM0=" + support::cpp11::to_string(m0));
+ build_options.add_option("-DK0=" + support::cpp11::to_string(k0));
+ build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
+ build_options.add_option_if((input_channels % k0) != 0, "-DLEFTOVER_LOOP");
+
+ if (is_data_type_quantized(output_data_type))
+ {
+ const UniformQuantizationInfo iqinfo = input->quantization_info().uniform();
+ const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
+ const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
+
+ PixelValue zero_value = PixelValue(0, input->data_type(), input->quantization_info());
+ int zero_value_s32;
+ zero_value.get(zero_value_s32);
+
+ float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
+
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+ build_options.add_option("-DIS_QUANTIZED");
+ build_options.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+ build_options.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift));
+ build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
+ build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
+ build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
+ build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32));
+ build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32));
+ }
+ else
+ {
+ build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(input_data_type));
+ build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(0));
+ }
+
+ if (compile_context.get_ddk_version() >= 30)
+ {
+ build_options.add_option("-fregister-allocation=64");
+ }
+
+ _kernel = create_kernel(compile_context, kernel_name, build_options.options());
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(input_data_type));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(weights_width);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(strides.first);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(strides.second);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output_width);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(m0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(n0);
+}
+
+Status ClTransposedConvolutionKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &deconv_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, deconv_info));
+ return Status{};
+}
+
+void ClTransposedConvolutionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ // Get initial windows
+ Window slice = window.first_slice_window_3D();
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto weights =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const auto biases =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ unsigned int idx = 0;
+ add_4d_tensor_nhwc_argument(idx, src);
+ add_4d_tensor_nhwc_argument(idx, dst);
+
+ add_4d_tensor_nhwc_argument(idx, weights);
+ if (biases != nullptr)
+ {
+ add_1D_tensor_argument(idx, biases, slice);
+ }
+
+ enqueue(queue, *this, slice, lws_hint());
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
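The padding rewrite used in configure() (p' = k - p - 1) on concrete numbers. The output-size relation shown alongside is the standard transposed-convolution formula and is included purely for illustration (symmetric padding assumed); the function name is hypothetical.

static void transposed_conv_padding_example()
{
    const unsigned int kernel = 3, pad = 1, stride = 2, in = 16;

    const unsigned int pad_updated = kernel - pad - 1;                      // p' = k - p - 1 = 1
    const unsigned int out         = (in - 1) * stride + kernel - 2 * pad;  // 31, assuming symmetric padding

    (void)pad_updated;
    (void)out;
}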
diff --git a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h
new file mode 100644
index 0000000000..44f6f56b7a
--- /dev/null
+++ b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_KERNEL_H
+#define ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel for transposed convolution. */
+class ClTransposedConvolutionKernel : public IClKernel
+{
+public:
+ ClTransposedConvolutionKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClTransposedConvolutionKernel);
+ /** Set the input, weights, biases and output tensors.
+ *
+ * Similar to @ref ClTransposedConvolution::configure()
+ *
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *output,
+ const PadStrideInfo &deconv_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClTransposedConvolution::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &deconv_info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp b/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp
new file mode 100644
index 0000000000..af80c4d796
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClWeightsReshapeKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+using namespace misc::shape_calculator;
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ unsigned int num_groups)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON(num_groups == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::NHWC && num_groups > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4 && num_groups > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(3) % num_groups) != 0);
+
+ if (biases != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(!is_data_type_float(input->data_type()));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1));
+ ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->num_dimensions() != 2));
+ ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) &&
+ (biases->dimension(0) != input->tensor_shape()[3]));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ (input->num_dimensions() == 5) &&
+ (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4]));
+ }
+
+ // Checks performed when output is configured
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ output->tensor_shape(), compute_weights_reshaped_shape(*input, biases != nullptr, num_groups));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
+ClWeightsReshapeKernel::ClWeightsReshapeKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClWeightsReshapeKernel::configure(const ClCompileContext &compile_context,
+ const ITensorInfo *src,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ unsigned int num_groups)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(
+ *dst, src->clone()->set_tensor_shape(compute_weights_reshaped_shape(*src, (biases != nullptr), num_groups)));
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, biases, dst, num_groups));
+ auto padding_info = get_padding_info({src, biases, dst});
+
+ const DataType data_type = src->data_type();
+
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(data_type)));
+ build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
+ build_opts.add_option_if(biases != nullptr, "-DHAS_BIAS");
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, "reshape_to_columns", build_opts.options());
+
+ // Configure window
+ Window win = calculate_max_window(*src, Steps());
+ ICLKernel::configure_internal(win);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClWeightsReshapeKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ unsigned int num_groups)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, biases, dst, num_groups));
+ return Status{};
+}
+
+void ClWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ Window out_window;
+ out_window.use_tensor_dimensions(dst->info()->tensor_shape());
+
+ Window in_slice = window.first_slice_window_3D();
+ Window out_slice = out_window.first_slice_window_2D();
+
+ Window biases_window;
+ Window biases_slice;
+
+ unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
+ idx += (biases != nullptr) ? num_arguments_per_1D_tensor() : 0;
+ _kernel.setArg<cl_uint>(idx++, src->info()->dimension(0));
+ _kernel.setArg<cl_uint>(idx++, src->info()->dimension(1));
+ _kernel.setArg<cl_uint>(idx++, src->info()->dimension(2));
+ _kernel.setArg<cl_uint>(idx++, src->info()->dimension(3));
+ _kernel.setArg<cl_uint>(idx++, dst->info()->strides_in_bytes().z());
+
+ if (biases != nullptr)
+ {
+ biases_window.use_tensor_dimensions(biases->info()->tensor_shape());
+ biases_slice = biases_window.first_slice_window_1D();
+ }
+
+ do
+ {
+ // Set arguments
+ unsigned idx = 0;
+ add_3D_tensor_argument(idx, src, in_slice);
+ add_2D_tensor_argument(idx, dst, out_slice);
+ if (biases != nullptr)
+ {
+ add_1D_tensor_argument(idx, biases, biases_slice);
+ ARM_COMPUTE_UNUSED(biases_window.slide_window_slice_1D(biases_slice));
+ }
+
+ // Run kernel
+ enqueue(queue, *this, in_slice, lws_hint());
+ } while (window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClWeightsReshapeKernel.h b/src/gpu/cl/kernels/ClWeightsReshapeKernel.h
new file mode 100644
index 0000000000..5e05f8d006
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWeightsReshapeKernel.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H
+#define ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to perform reshaping on the weights used by convolution and locally connected layer
+ *
+ * Rearranges each 3-dimensional kernel into a single row, producing a matrix of linearized kernels.
+ * In combination with the @ref opencl::kernels::ClIm2ColKernel, it can transform a convolution into a matrix multiplication.
+ *
+ * For example, assuming a 3x3 weight kernel with a depth of 2, we have:
+ * @f[
+ * \left( \begin{array}{ccc}
+ * a000 & a001 & a002 \\
+ * a010 & a011 & a012 \\
+ * a020 & a021 & a022 \\
+ * \end{array} \right)
+ * \left( \begin{array}{ccc}
+ * a100 & a101 & a102 \\
+ * a110 & a111 & a112 \\
+ * a120 & a121 & a122 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccc}
+ * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
+ * \end{array} \right)
+ * @f]
+ */
+class ClWeightsReshapeKernel : public IClKernel
+{
+public:
+ ClWeightsReshapeKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWeightsReshapeKernel);
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src The input tensor info to convert. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
+ * and a 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: All
+ * @param[in] biases The shared biases tensor info to append. Biases are a 1D tensor with dimensions [OFM] if shared and a 2D tensor with
+ * dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32; for quantized types this must be nullptr.
+ * @warning Appending biases to the reshaped weights matrix is not supported for quantized asymmetric types.
+ * @param[out] dst The output tensor info. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise.
+ * Data types supported: Same as @p src
+ * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution.
+ * num_groups != 1 is only supported for the NCHW data layout, and the number of weights must be a multiple of num_groups.
+ */
+ void configure(const ClCompileContext &compile_context,
+ const ITensorInfo *src,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ unsigned int num_groups = 1);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClWeightsReshapeKernel::configure()
+ *
+ * @return a status
+ */
+ static Status
+ validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst, unsigned int num_groups = 1);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H */
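
For orientation, below is a minimal usage sketch of the kernel added above. It is not part of the patch: the tensor shapes, data type, the use of CLScheduler/CLKernelLibrary for the queue and compile context, and the absence of biases are illustrative assumptions, and this is an internal kernel normally driven by the operator layer rather than called directly.

// Illustrative sketch (assumed shapes/types): reshape 3x3x16x32 FP32 weights into a 2D matrix.
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/gpu/cl/kernels/ClWeightsReshapeKernel.h"

using namespace arm_compute;

void weights_reshape_sketch()
{
    CLScheduler::get().default_init(); // assumption: a default CL context/queue is acceptable

    TensorInfo weights_info(TensorShape(3U, 3U, 16U, 32U), 1, DataType::F32);
    TensorInfo dst_info{}; // left empty on purpose: configure() auto-initializes it

    opencl::kernels::ClWeightsReshapeKernel kernel;
    ARM_COMPUTE_ERROR_THROW_ON(opencl::kernels::ClWeightsReshapeKernel::validate(&weights_info, nullptr, &dst_info, 1));
    kernel.configure(CLKernelLibrary::get().get_compile_context(), &weights_info, nullptr, &dst_info, 1);

    CLTensor weights, dst;
    weights.allocator()->init(weights_info);
    dst.allocator()->init(dst_info);
    weights.allocator()->allocate();
    dst.allocator()->allocate();

    // No ACL_BIAS entry is added, matching the nullptr biases passed to configure().
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC, &weights);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    kernel.run_op(pack, kernel.window(), CLScheduler::get().queue());
}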
diff --git a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp
new file mode 100644
index 0000000000..15195025ce
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/helpers/tensor_info.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1);
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) > dst->dimension(0));
+
+ for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i));
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->num_dimensions() > 4);
+
+ return Status{};
+}
+} // namespace
+
+Status
+ClWidthConcatenate2TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst));
+ return Status{};
+}
+
+ClWidthConcatenate2TensorsKernel::ClWidthConcatenate2TensorsKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst));
+
+ auto padding_info = get_padding_info({src1, src2, dst});
+
+ const unsigned int min_dimension = std::min(src1->dimension(0), src2->dimension(0));
+ const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension);
+ const unsigned int vec_size_leftover = dst->dimension(0) % num_elems_processed_per_iteration;
+
+ // Add build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1->data_type()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
+ build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size()));
+ build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) %
+ num_elems_processed_per_iteration));
+
+ // If the inputs have different quantization info, set the quantization parameters needed for the re-quantization process
+ const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2);
+ if (is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo)
+ {
+ const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform();
+ const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
+
+ build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq1_info.offset));
+ build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale));
+ build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(iq2_info.offset));
+ build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale));
+ build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
+ build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
+ }
+
+ _depth = src1->dimension(2);
+ _input1_width = src1->dimension(0);
+
+ std::string kernel_name = "concatenate_width_x2";
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
+ ICLKernel::configure_internal(win.collapse(win, Window::DimZ));
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "concatenate_width_x2_";
+ _config_id += lower_string(string_from_data_type(src1->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src1->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src1->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src2->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src2->dimension(1));
+}
+
+void ClWidthConcatenate2TensorsKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_4D();
+
+ const auto src0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC));
+ const auto src1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, src0, slice);
+ add_4D_tensor_argument(idx, src1, slice);
+ add_4D_tensor_argument(idx, dst, slice);
+ _kernel.setArg<cl_int>(idx++, _depth);
+ _kernel.setArg<cl_int>(idx++, _input1_width);
+ enqueue(queue, *this, window, lws_hint());
+ } while (window.slide_window_slice_4D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h
new file mode 100644
index 0000000000..8b53d6d66b
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_WIDTHCONCATENATE_2TENSORS_KERNEL_H
+#define ARM_COMPUTE_CL_WIDTHCONCATENATE_2TENSORS_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the width concatenate kernel of 2 tensors.
+ * The src1 and src2 tensors will be concatenated into the dst tensor.
+ */
+class ClWidthConcatenate2TensorsKernel : public IClKernel
+{
+public:
+ ClWidthConcatenate2TensorsKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenate2TensorsKernel);
+ /** Initialise the kernel's sources and destination
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src1 First source tensor info. Data types supported: All.
+ * @param[in] src2 Second source tensor info. Data types supported: same as @p src1
+ * @param[out] dst Destination tensor info. Data types supported: Same as @p src1.
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClWidthConcatenate2TensorsKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+
+private:
+ int32_t _depth{0};
+ int32_t _input1_width{0};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_WIDTHCONCATENATE_2TENSORS_KERNEL_H */
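
A minimal sketch of driving this kernel follows; the shapes, the F32 data type and the scheduler/compile-context setup are illustrative assumptions, and in practice the kernel is dispatched by the concatenation operator rather than used directly. The point of interest is that the two sources occupy consecutive ACL_SRC_VEC slots, matching run_op() above.

// Illustrative sketch (assumed shapes/types): concatenate a 24-wide and an 8-wide tensor along x.
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h"

using namespace arm_compute;

void width_concat_x2_sketch()
{
    CLScheduler::get().default_init(); // assumption: default CL context/queue

    TensorInfo in0(TensorShape(24U, 16U, 4U), 1, DataType::F32);
    TensorInfo in1(TensorShape(8U, 16U, 4U), 1, DataType::F32);
    TensorInfo out(TensorShape(32U, 16U, 4U), 1, DataType::F32); // width 32 = 24 + 8

    opencl::kernels::ClWidthConcatenate2TensorsKernel kernel;
    ARM_COMPUTE_ERROR_THROW_ON(opencl::kernels::ClWidthConcatenate2TensorsKernel::validate(&in0, &in1, &out));
    kernel.configure(CLKernelLibrary::get().get_compile_context(), &in0, &in1, &out);

    CLTensor t0, t1, t_out;
    t0.allocator()->init(in0);
    t1.allocator()->init(in1);
    t_out.allocator()->init(out);
    t0.allocator()->allocate();
    t1.allocator()->allocate();
    t_out.allocator()->allocate();

    // The sources are bound at consecutive ACL_SRC_VEC slots, as read back in run_op().
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_VEC, &t0);
    pack.add_const_tensor(TensorType::ACL_SRC_VEC + 1, &t1);
    pack.add_tensor(TensorType::ACL_DST, &t_out);
    kernel.run_op(pack, kernel.window(), CLScheduler::get().queue());
}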
diff --git a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp
new file mode 100644
index 0000000000..c4f84e3e45
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/helpers/tensor_info.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *src3,
+ const ITensorInfo *src4,
+ const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1);
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, src3, src4, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) + src3->dimension(0) + src4->dimension(0) >
+ dst->dimension(0));
+
+ for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(src3->dimension(i) != dst->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(src4->dimension(i) != dst->dimension(i));
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->num_dimensions() > 4);
+
+ return Status{};
+}
+} // namespace
+
+ClWidthConcatenate4TensorsKernel::ClWidthConcatenate4TensorsKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+Status ClWidthConcatenate4TensorsKernel::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *src3,
+ const ITensorInfo *src4,
+ const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, src3, src4, dst));
+ return Status{};
+}
+
+void ClWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *src3,
+ ITensorInfo *src4,
+ ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, src3, src4, dst));
+
+ auto padding_info = get_padding_info({src1, src2, src3, src4, dst});
+ const unsigned int min_dimension =
+ std::min(std::min(src1->dimension(0), src2->dimension(0)), std::min(src3->dimension(0), src4->dimension(0)));
+ const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension);
+ const unsigned int vec_size_leftover = dst->dimension(0) % num_elems_processed_per_iteration;
+
+ // Add build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1->data_type()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
+ build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size()));
+ build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) %
+ num_elems_processed_per_iteration));
+ build_opts.add_option("-DINPUT2_ROTATE_N=" +
+ support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) - vec_size_leftover) %
+ num_elems_processed_per_iteration));
+ build_opts.add_option("-DINPUT3_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) +
+ src3->dimension(0) - vec_size_leftover) %
+ num_elems_processed_per_iteration));
+
+ _depth = src1->dimension(2);
+ _input1_width = src1->dimension(0);
+ _input2_width = src2->dimension(0);
+ _input3_width = src3->dimension(0);
+
+ // If the sources have different quantization info, set the quantization parameters needed for the re-quantization process
+ const bool have_different_qinfo =
+ helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2, src3, src4);
+ if (is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo)
+ {
+ const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform();
+ const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform();
+ const UniformQuantizationInfo iq3_info = src3->quantization_info().uniform();
+ const UniformQuantizationInfo iq4_info = src4->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
+
+ build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq1_info.offset));
+ build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale));
+ build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(iq2_info.offset));
+ build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale));
+ build_opts.add_option("-DOFFSET_IN3=" + float_to_string_with_full_precision(iq3_info.offset));
+ build_opts.add_option("-DSCALE_IN3=" + float_to_string_with_full_precision(iq3_info.scale));
+ build_opts.add_option("-DOFFSET_IN4=" + float_to_string_with_full_precision(iq4_info.offset));
+ build_opts.add_option("-DSCALE_IN4=" + float_to_string_with_full_precision(iq4_info.scale));
+ build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
+ build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
+ }
+ std::string kernel_name = "concatenate_width_x4";
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
+ ICLKernel::configure_internal(win.collapse(win, Window::DimZ));
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "concatenate_width_x4_";
+ _config_id += lower_string(string_from_data_type(src1->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src1->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src1->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src2->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src2->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src3->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src3->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src4->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src4->dimension(1));
+}
+
+void ClWidthConcatenate4TensorsKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC));
+ const auto src1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1));
+ const auto src2 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 2));
+ const auto src3 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 3));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ Window slice = window.first_slice_window_4D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, src0, slice);
+ add_4D_tensor_argument(idx, src1, slice);
+ add_4D_tensor_argument(idx, src2, slice);
+ add_4D_tensor_argument(idx, src3, slice);
+ add_4D_tensor_argument(idx, dst, slice);
+ _kernel.setArg<cl_int>(idx++, _depth);
+ _kernel.setArg<cl_int>(idx++, _input1_width);
+ _kernel.setArg<cl_int>(idx++, _input2_width);
+ _kernel.setArg<cl_int>(idx++, _input3_width);
+ enqueue(queue, *this, window, lws_hint());
+ } while (window.slide_window_slice_4D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h
new file mode 100644
index 0000000000..f589b8ac1a
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_CL_WIDTH_CONCATENATE_4TENSORS_KERNEL_H
+#define ARM_COMPUTE_CL_WIDTH_CONCATENATE_4TENSORS_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the width concatenate kernel of 4 tensors.
+ * All source tensors will be concatenated into the destination tensor.
+ */
+class ClWidthConcatenate4TensorsKernel : public IClKernel
+{
+public:
+ ClWidthConcatenate4TensorsKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenate4TensorsKernel);
+ /** Initialise the kernel's sources and destination
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src1 First source tensor info. Data types supported: All.
+ * @param[in] src2 Second source tensor info. Data types supported: same as @p src1
+ * @param[in] src3 Third source tensor info. Data types supported: same as @p src1
+ * @param[in] src4 Fourth source tensor info. Data types supported: same as @p src1
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src1.
+ */
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *src3,
+ ITensorInfo *src4,
+ ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClWidthConcatenate4TensorsKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *src3,
+ const ITensorInfo *src4,
+ const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+
+private:
+ int32_t _depth{0};
+ int32_t _input1_width{0};
+ int32_t _input2_width{0};
+ int32_t _input3_width{0};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_WIDTH_CONCATENATE_4TENSORS_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp
new file mode 100644
index 0000000000..989de4a7b7
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClWidthConcatenateKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0));
+
+ for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i));
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() > 4);
+
+ return Status{};
+}
+} // namespace
+
+ClWidthConcatenateKernel::ClWidthConcatenateKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+Status ClWidthConcatenateKernel::validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, width_offset, dst));
+ return Status{};
+}
+
+void ClWidthConcatenateKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ unsigned int width_offset,
+ ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, width_offset, dst));
+
+ auto padding_info = get_padding_info({src, dst});
+
+ const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, src->dimension(0));
+
+ // Add build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(width_offset));
+
+ if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
+ {
+ const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
+ const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
+
+ build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iqinfo.offset));
+ build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oqinfo.offset));
+ build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iqinfo.scale));
+ build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale));
+ }
+ _depth = src->dimension(2);
+ std::string kernel_name = "concatenate_width";
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
+ // Create kernel
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
+ ICLKernel::configure_internal(win.collapse(win, Window::DimZ));
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+void ClWidthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, src, window);
+ add_4D_tensor_argument(idx, dst, window);
+ _kernel.setArg<cl_uint>(idx++, _depth);
+ enqueue(queue, *this, window, lws_hint());
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClWidthConcatenateKernel.h b/src/gpu/cl/kernels/ClWidthConcatenateKernel.h
new file mode 100644
index 0000000000..c10d6a4dc6
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWidthConcatenateKernel.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_WIDTH_CONCATENATE_KERNEL_H
+#define ARM_COMPUTE_CL_WIDTH_CONCATENATE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the width concatenate kernel.
+ * The source tensor will be concatenated into the destination tensor.
+ */
+class ClWidthConcatenateKernel : public IClKernel
+{
+public:
+ ClWidthConcatenateKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenateKernel);
+ /** Initialise the kernel's source and destination
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: All.
+ * @param[in] width_offset The offset on the X axis.
+ * @param[in,out] dst Destination tensor info. Data types supported: same as @p src.
+ *
+ */
+ void
+ configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClWidthConcatenateKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+
+private:
+ int32_t _depth{0};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_WIDTH_CONCATENATE_KERNEL_H */
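
For completeness, a minimal configuration sketch of the single-input variant follows. The shapes and the two-kernel composition are illustrative assumptions; the point is only that width_offset is the running x position at which each source is written into the shared destination, so concatenating several tensors means configuring one kernel per source with accumulated offsets.

// Illustrative sketch (assumed shapes/types): two single-input kernels writing into one destination.
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/gpu/cl/kernels/ClWidthConcatenateKernel.h"

using namespace arm_compute;

void width_concat_offsets_sketch()
{
    CLScheduler::get().default_init(); // assumption: default CL context/queue

    TensorInfo in0(TensorShape(24U, 16U, 4U), 1, DataType::F32);
    TensorInfo in1(TensorShape(8U, 16U, 4U), 1, DataType::F32);
    TensorInfo out(TensorShape(32U, 16U, 4U), 1, DataType::F32);

    opencl::kernels::ClWidthConcatenateKernel k0, k1;
    ARM_COMPUTE_ERROR_THROW_ON(opencl::kernels::ClWidthConcatenateKernel::validate(&in0, 0U, &out));
    ARM_COMPUTE_ERROR_THROW_ON(opencl::kernels::ClWidthConcatenateKernel::validate(&in1, 24U, &out));

    // Each kernel copies its source into dst starting at its width offset: [0, 24) and [24, 32).
    k0.configure(CLKernelLibrary::get().get_compile_context(), &in0, 0U, &out);
    k1.configure(CLKernelLibrary::get().get_compile_context(), &in1, 24U, &out);

    // At run time each kernel receives an ITensorPack with ACL_SRC / ACL_DST and is enqueued
    // on a cl::CommandQueue via run_op(), as shown in the .cpp above.
}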
diff --git a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp
new file mode 100644
index 0000000000..58c01d4da5
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+
+ const Size2D kernel_size = winograd_info.kernel_size;
+ const Size2D output_tile_size = winograd_info.output_tile_size;
+
+ const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()),
+ "Winograd filter transform not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width ||
+ input->dimension(idx_h) != kernel_size.height);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+ // Checks performed when output is configured
+ if (output->total_size() != 0)
+ {
+ const TensorInfo tensor_info_output =
+ input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input, winograd_info));
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_UNUSED(output);
+
+ const unsigned int num_elems_processed_per_iteration_x =
+ input->data_layout() == DataLayout::NCHW ? input->dimension(0) : 1;
+ const unsigned int num_elems_processed_per_iteration_y = input->dimension(1);
+ const unsigned int num_elems_read_per_iteration_z =
+ input->data_layout() == DataLayout::NCHW ? 1 : input->dimension(2);
+
+ Window win =
+ calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y,
+ num_elems_read_per_iteration_z));
+ Window win_collapsed = win.collapse(win, Window::DimZ);
+ return std::make_pair(Status{}, win_collapsed);
+}
+} // namespace
+
+ClWinogradFilterTransformKernel::ClWinogradFilterTransformKernel()
+{
+ _type = CLKernelType::WINOGRAD;
+}
+
+void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const WinogradInfo &winograd_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*dst,
+ src->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*src, winograd_info)));
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info));
+ auto padding_info = get_padding_info({src, dst});
+
+ // Set build options
+ CLBuildOptions build_opts;
+
+ // For NHWC layouts, pass the tensor dimensions at runtime
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ _src_dim_z = src->dimension(2);
+ }
+ else
+ {
+ build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(src->dimension(2)));
+ }
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+ build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL");
+ build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_FILTER_TRANSFORM_VERTICAL");
+ const Size2D kernel_size = winograd_info.kernel_size;
+ const Size2D output_tile_size = winograd_info.output_tile_size;
+
+ // Create kernel
+ std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" +
+ kernel_size.to_string() + "_" + lower_string(string_from_data_layout(src->data_layout()));
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ IClKernel::configure_internal(win_config.second);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClWinogradFilterTransformKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const WinogradInfo &winograd_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first);
+
+ return Status{};
+}
+
+void ClWinogradFilterTransformKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window);
+
+ auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ // Setup output window
+ Window window_out;
+ window_out.use_tensor_dimensions(dst->info()->tensor_shape(), 0);
+
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, src, window);
+ add_3D_tensor_argument(idx, dst, window_out);
+ if (src->info()->data_layout() == DataLayout::NHWC)
+ {
+ _kernel.setArg<cl_uint>(idx++, _src_dim_z);
+ }
+ enqueue(queue, *this, window, lws_hint());
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h
new file mode 100644
index 0000000000..6e439f0c99
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2018-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_WINOGRAD_FILTER_TRANSFORM_KERNEL_H
+#define ARM_COMPUTE_CL_WINOGRAD_FILTER_TRANSFORM_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the Winograd filter transform kernel. */
+class ClWinogradFilterTransformKernel : public IClKernel
+{
+public:
+ ClWinogradFilterTransformKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWinogradFilterTransformKernel);
+ /** Set the input and output tensor.
+ *
+ * @note Winograd filter transform supports the following configurations for NCHW data layout
+ * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
+ * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+ * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+ *
+ * @note Winograd filter transform supports the following configurations for NHWC data layout
+ * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+ * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+ *
+ * Strides: only unit strides
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout) or [IFM, kernel_x, kernel_y, OFM] (NHWC data layout). Data types supported: F16/F32.
+ * @param[out] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. Data types supported: Same as @p src
+ * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const WinogradInfo &winograd_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClWinogradFilterTransformKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ int32_t _src_dim_z{0};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_WINOGRAD_FILTER_TRANSFORM_KERNEL_H */
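
A minimal validate/configure sketch for the filter-transform kernel follows. The F(4x4, 3x3) configuration, NCHW layout, weight shape and input spatial size are assumptions chosen to satisfy the constraints documented above; the transformed output shape is left to configure()'s auto-initialization.

// Illustrative sketch (assumed shapes/types): transform 3x3 FP32 weights for an F(4x4, 3x3) Winograd convolution.
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h"

using namespace arm_compute;

void winograd_filter_transform_sketch()
{
    CLScheduler::get().default_init(); // assumption: default CL context/queue

    const WinogradInfo winograd_info(Size2D(4U, 4U),           // output tile size
                                     Size2D(3U, 3U),           // kernel size
                                     Size2D(56U, 56U),         // convolution input spatial dimensions (assumed)
                                     PadStrideInfo(1, 1, 1, 1),
                                     DataLayout::NCHW);

    TensorInfo weights(TensorShape(3U, 3U, 16U, 32U), 1, DataType::F32); // [kernel_x, kernel_y, IFM, OFM]
    weights.set_data_layout(DataLayout::NCHW);
    TensorInfo transformed{}; // auto-initialized by configure()

    opencl::kernels::ClWinogradFilterTransformKernel kernel;
    ARM_COMPUTE_ERROR_THROW_ON(
        opencl::kernels::ClWinogradFilterTransformKernel::validate(&weights, &transformed, winograd_info));
    kernel.configure(CLKernelLibrary::get().get_compile_context(), &weights, &transformed, winograd_info);
    // run_op() then takes an ITensorPack with ACL_SRC / ACL_DST, as in the .cpp above.
}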
diff --git a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp
new file mode 100644
index 0000000000..54c48986fc
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClWinogradInputTransformKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+
+ const PadStrideInfo conv_info = winograd_info.convolution_info;
+ const Size2D output_tile_size = winograd_info.output_tile_size;
+ const Size2D kernel_size = winograd_info.kernel_size;
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1,
+ "Winograd input transform only supports unit strides");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()),
+ "Winograd input transform not supported");
+
+ ARM_COMPUTE_UNUSED(conv_info);
+ ARM_COMPUTE_UNUSED(output_tile_size);
+ ARM_COMPUTE_UNUSED(kernel_size);
+
+ // Validate configured output
+ if (output->total_size() != 0)
+ {
+ const TensorShape output_shape =
+ misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+ ARM_COMPUTE_UNUSED(output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ bool window_changed = false;
+ int num_elems_processed_per_iteration = 1;
+
+ if (input->data_layout() == DataLayout::NHWC)
+ {
+ // In the case of FP16 computation, we can process more
+ // output feature maps in a single work-item.
+ // From experiments, num_elems_processed_per_iteration = 2 works well for FP16 and
+ // improves performance. However, to keep the implementation simpler,
+ // we set num_elems_processed_per_iteration = 2 only when the number of OFMs is a multiple of 2.
+ // Note: At the moment, only the 3x3 Winograd input transform supports N0 != 1
+ const DataType dt = input->data_type();
+ const size_t dim0 = input->dimension(0);
+ const size_t k_sz = winograd_info.kernel_size.area();
+ const bool cond = dt == DataType::F16 && ((dim0 % 2) == 0);
+ if (cond)
+ {
+ if (k_sz == 3 || k_sz == 9)
+ {
+ num_elems_processed_per_iteration = 2;
+ }
+ }
+ }
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+ if (input->data_layout() == DataLayout::NCHW)
+ {
+ const PadStrideInfo conv_info = winograd_info.convolution_info;
+ const Size2D output_tile_size = winograd_info.output_tile_size;
+ const Size2D kernel_size = winograd_info.kernel_size;
+
+ unsigned int num_elems_read_per_iteration_x = output_tile_size.width + kernel_size.width - 1;
+ unsigned int num_elems_read_per_iteration_y = output_tile_size.height + kernel_size.height - 1;
+
+ AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(),
+ num_elems_read_per_iteration_x, num_elems_read_per_iteration_y);
+ window_changed = update_window_and_padding(win, input_access);
+ }
+
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+ClWinogradInputTransformKernel::ClWinogradInputTransformKernel()
+{
+ _type = CLKernelType::WINOGRAD;
+}
+
+BorderSize ClWinogradInputTransformKernel::border_size() const
+{
+ return _border_size;
+}
+
+void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const WinogradInfo &winograd_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info));
+
+ auto padding_info = get_padding_info({src, dst});
+
+ const PadStrideInfo conv_info = winograd_info.convolution_info;
+ const Size2D output_tile_size = winograd_info.output_tile_size;
+ const Size2D kernel_size = winograd_info.kernel_size;
+
+ _data_layout = src->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+
+ // Compute the number of output tiles of size "output_tile_size" along the x and y directions
+ const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(src->dimension(idx_w), src->dimension(idx_h)),
+ kernel_size, output_tile_size, conv_info);
+
+ _num_tiles_x = num_tiles.width;
+ _num_tiles_y = num_tiles.height;
+
+ const TensorShape output_shape =
+ misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(output_shape));
+
+ ARM_COMPUTE_ERROR_ON(_num_tiles_x * _num_tiles_y != static_cast<int>(dst->dimension(1)));
+ const size_t total_batches = src->tensor_shape().total_size_upper(3);
+
+ // Create window and update padding
+ auto win_config = validate_and_configure_window(src, dst, winograd_info);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ IClKernel::configure_internal(win_config.second, cl::NDRange(1, 1, 8));
+
+ _src_width = src->dimension(idx_w);
+ _src_height = src->dimension(idx_h);
+
+ CLBuildOptions build_opts;
+ if (_data_layout == DataLayout::NHWC)
+ {
+ build_opts.add_option("-DNHWC");
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step()));
+ build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
+ build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
+ build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
+ build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+ build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL");
+ build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL");
+ build_opts.add_option_if(total_batches > 1, "-DIS_BATCHED");
+ }
+ else
+ {
+ build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x));
+ build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
+ build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
+ build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
+ build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+ build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL");
+ build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL");
+ build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(2)));
+ }
+
+ // Create kernel
+ std::string kernel_name =
+ "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string();
+
+ // Get the maximum dimension from the tile size
+ const unsigned int tile_max_dim = std::max(output_tile_size.width, output_tile_size.height);
+
+ // Use the step_z = 2 optimized kernel variant when the output tile size is 2x2 (NCHW only)
+ if ((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW))
+ {
+ _step_z = (src->dimension(2) % 2) != 0 ? 1 : 2;
+ }
+
+ // Append stepz and data layout
+ kernel_name += "_stepz";
+ kernel_name += support::cpp11::to_string(_step_z);
+ kernel_name += "_" + lower_string(string_from_data_layout(_data_layout));
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ _border_size = BorderSize(src->padding());
+
+ ARM_COMPUTE_ERROR_ON((src->data_layout() == DataLayout::NHWC) && has_padding_changed(padding_info));
+
+ _config_id = kernel_name;
+ _config_id += support::cpp11::to_string(src->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(conv_info.pad_left());
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(conv_info.pad_top());
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_layout(_data_layout));
+}
+
+Status ClWinogradInputTransformKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const WinogradInfo &winograd_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(src->clone().get(), dst->clone().get(), winograd_info).first);
+ return Status{};
+}
+
+void ClWinogradInputTransformKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+ const size_t total_batches = window.shape().total_size_upper(3);
+
+ // Collapse window
+ Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ);
+
+ if (_data_layout == DataLayout::NHWC)
+ {
+ Window slice = window_collapsed.first_slice_window_3D();
+ slice.set(1, Window::Dimension(0, _num_tiles_x * _num_tiles_y, 1));
+ slice.set(2, Window::Dimension(0, total_batches, 1));
+
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, src, slice);
+ add_4D_tensor_argument(idx, dst, slice);
+ _kernel.setArg<cl_uint>(idx++, _src_width);
+ _kernel.setArg<cl_uint>(idx++, _src_height);
+ _kernel.setArg<cl_uint>(idx++, _num_tiles_x);
+ _kernel.setArg<cl_uint>(idx++, _num_tiles_y);
+ enqueue(queue, *this, slice, lws_hint());
+ }
+ else
+ {
+ Window slice = window_collapsed.first_slice_window_3D();
+ slice.set(idx_w, Window::Dimension(0, _num_tiles_x, 1));
+ slice.set(idx_h, Window::Dimension(0, _num_tiles_y, 1));
+
+ ARM_COMPUTE_ERROR_ON(((slice[idx_c].end() - slice[idx_c].start()) % _step_z) != 0);
+ slice.set(idx_c, Window::Dimension(slice[idx_c].start(), slice[idx_c].end(), _step_z));
+
+ unsigned int idx = 2 * num_arguments_per_3D_tensor();
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src->info()->strides_in_bytes()[3]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[3]));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, src, slice);
+ add_3D_tensor_argument(idx, dst, slice);
+
+ enqueue(queue, *this, slice, lws_hint());
+ } while (window_collapsed.slide_window_slice_3D(slice));
+ }
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h
new file mode 100644
index 0000000000..cebebea1d3
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2018-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_WINOGRAD_INPUT_TRANSFORM_KERNEL_H
+#define ARM_COMPUTE_CL_WINOGRAD_INPUT_TRANSFORM_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to perform Winograd input transform.*/
+class ClWinogradInputTransformKernel : public IClKernel
+{
+public:
+ ClWinogradInputTransformKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWinogradInputTransformKernel);
+ /** Set the input and output of the kernel.
+ *
+ * @note Winograd input transform supports the following configurations for NCHW data layout
+ * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
+ * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+ * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+ *
+ * @note Winograd input transform supports the following configurations for NHWC data layout
+ * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+ * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+ *
+ * Strides: only unit strides
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src The input tensor info to transform. Data types supported: F16/F32
+ * @param[out] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p src
+ * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo.
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const WinogradInfo &winograd_info);
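+ //
+ // Illustrative usage sketch (the tensor shapes, the compile context and the F(4x4, 3x3)
+ // NHWC configuration below are placeholder assumptions, not library documentation):
+ //
+ //   TensorInfo src(TensorShape(64U, 56U, 56U, 1U), 1, DataType::F32);
+ //   src.set_data_layout(DataLayout::NHWC);
+ //   TensorInfo dst{}; // left empty; auto-initialized by configure()
+ //   const WinogradInfo winograd_info(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(56U, 56U),
+ //                                    PadStrideInfo(1, 1, 1, 1), DataLayout::NHWC);
+ //   ClWinogradInputTransformKernel kernel;
+ //   kernel.configure(compile_context, &src, &dst, winograd_info);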
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClWinogradInputTransformKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>;
+
+ BorderSize _border_size{0};
+ DataLayout _data_layout{DataLayout::UNKNOWN};
+ int _num_tiles_x{0};
+ int _num_tiles_y{0};
+ unsigned int _step_z{1};
+ int32_t _src_width{0};
+ int32_t _src_height{0};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_WINOGRAD_INPUT_TRANSFORM_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
new file mode 100644
index 0000000000..89c80c55ef
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+#include <cmath>
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const WinogradInfo &winograd_info,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_UNUSED(act_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != winograd_info.output_data_layout);
+
+ const PadStrideInfo conv_info = winograd_info.convolution_info;
+ const Size2D output_tile_size = winograd_info.output_tile_size;
+ const Size2D kernel_size = winograd_info.kernel_size;
+ const Size2D input_dimensions = winograd_info.input_dimensions;
+ const unsigned int num_channels = (winograd_info.kernel_size.width + winograd_info.output_tile_size.width - 1) *
+ (winograd_info.kernel_size.height + winograd_info.output_tile_size.height - 1);
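+ // Illustrative check: for F(4x4, 3x3) this gives (3 + 4 - 1) * (3 + 4 - 1) = 36 channels
+ // expected on dimension 2 of the transformed input.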
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, winograd_info.output_data_layout),
+ "Winograd output transform not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != num_channels, "Wrong number of channels");
+
+ // Compute number of elements to process in the X and Y direction
+ // Compute the number of output tiles along the x and y direction of size "output_tile_size"
+ const Size2D num_tiles =
+ compute_winograd_convolution_tiles(input_dimensions, kernel_size, output_tile_size, conv_info);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast<unsigned int>((num_tiles.area())));
+
+ if (bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
+ }
+
+ // Checks performed when output is configured
+ if (output->total_size() != 0)
+ {
+ const TensorInfo tensor_info_output =
+ input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info));
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input,
+ ITensorInfo *bias,
+ ITensorInfo *output,
+ const Size2D &output_tile_size)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_UNUSED(bias);
+
+ unsigned int num_elems_processed_per_iteration = 1;
+
+ if (input->data_layout() == DataLayout::NHWC)
+ {
+ // For FP16 computation we can process more output feature maps per work-item.
+ // Experiments show that num_elems_processed_per_iteration = 2 improves FP16 performance.
+ // However, to keep the implementation simple, we set num_elems_processed_per_iteration = 2
+ // only when the number of OFMs is a multiple of 2.
+ const DataType dt = input->data_type();
+ const size_t dim0 = input->dimension(0);
+ const bool cond = dt == DataType::F16 && ((dim0 % 2) == 0);
+ if (cond)
+ {
+ num_elems_processed_per_iteration = 2;
+ }
+ }
+
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ bool window_changed = false;
+
+ if (output->data_layout() == DataLayout::NCHW)
+ {
+ const int output_static_window_end_x = ceil_to_multiple(output->dimension(0), output_tile_size.width);
+ const int output_static_window_end_y = ceil_to_multiple(output->dimension(1), output_tile_size.height);
+
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration,
+ num_elems_processed_per_iteration);
+ AccessWindowStatic output_access(output, 0, 0, output_static_window_end_x, output_static_window_end_y);
+ window_changed = update_window_and_padding(win, input_access, output_access);
+ }
+
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+ClWinogradOutputTransformKernel::ClWinogradOutputTransformKernel()
+{
+ _type = CLKernelType::WINOGRAD;
+}
+
+void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const WinogradInfo &winograd_info,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*dst,
+ src->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*src, winograd_info)));
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, winograd_info, act_info));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(src, bias, dst, winograd_info.output_tile_size);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ IClKernel::configure_internal(win_config.second);
+
+ auto padding_info = get_padding_info({src, bias, dst});
+
+ _is_nhwc = winograd_info.output_data_layout == DataLayout::NHWC;
+
+ // Compute num_tiles_x
+ const Size2D input_dimensions = winograd_info.input_dimensions;
+ const Size2D kernel_size = winograd_info.kernel_size;
+ const Size2D output_tile_size = winograd_info.output_tile_size;
+ const PadStrideInfo conv_info = winograd_info.convolution_info;
+ const int idx_width = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height =
+ get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::HEIGHT);
+
+ // Compute the number of output tiles along the x and y direction of size "output_tile_size"
+ const Size2D num_tiles =
+ compute_winograd_convolution_tiles(input_dimensions, kernel_size, output_tile_size, conv_info);
+ const size_t total_batches = dst->tensor_shape().total_size_upper(3);
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
+ build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
+ build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
+
+ if ((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2))
+ {
+ build_opts.add_option("-DVEC_SIZE=2");
+ }
+ else if ((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4))
+ {
+ build_opts.add_option("-DVEC_SIZE=4");
+ }
+
+ _num_tiles_x = num_tiles.width;
+
+ // Conditions of -cl-fast-relaxed-math causing accuracy issues can be traced from COMPMID-5324
+ const GPUTarget gpu_target = get_target();
+ const auto act_function = act_info.activation();
+ const auto src_data_type = src->data_type();
+
+ if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+ (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU ||
+ act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) &&
+ (src_data_type == DataType::F32 || src_data_type == DataType::F16))
+ {
+ // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations.
+ // To avoid -cl-finite-math-only, we only pass -cl-unsafe-math-optimizations.
+ build_opts.add_option("-cl-unsafe-math-optimizations");
+ }
+ else
+ {
+ build_opts.add_option("-cl-fast-relaxed-math");
+ }
+
+ if (_is_nhwc)
+ {
+ build_opts.add_option_if(bias != nullptr, std::string("-DHAS_BIAS"));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step()));
+ build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
+ build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src_data_type));
+ build_opts.add_option_if(total_batches > 1, "-DIS_BATCHED");
+ build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL");
+ build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL");
+ build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x));
+ }
+ else
+ {
+ build_opts.add_option_if(bias != nullptr, std::string("-DHAS_BIAS"));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step()));
+ build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(num_tiles.width));
+ build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
+ build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src_data_type));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(1)));
+ build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(idx_width)));
+ build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height)));
+ build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(2)));
+ build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL");
+ build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL");
+ }
+
+ // Storing tensor dimensions to be sent later as kernel arguments
+ _src_height = src->dimension(1);
+ _dst_width = dst->dimension(idx_width);
+ _dst_height = dst->dimension(idx_height);
+
+ // Create kernel
+ std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" +
+ kernel_size.to_string() + "_" +
+ lower_string(string_from_data_layout(winograd_info.output_data_layout));
+
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(src_data_type));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(dst->dimension(1));
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_layout(winograd_info.output_data_layout));
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info) && _is_nhwc);
+}
+
+Status ClWinogradOutputTransformKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const WinogradInfo &winograd_info,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(src, (bias != nullptr ? bias->clone().get() : nullptr), dst, winograd_info, act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(),
+ (bias != nullptr ? bias->clone().get() : nullptr),
+ dst->clone().get(), winograd_info.output_tile_size)
+ .first);
+ return Status{};
+}
+
+void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window);
+
+ auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ // Collapse window
+ Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ);
+
+ // Get initial windows
+ Window slice = window_collapsed.first_slice_window_4D();
+ slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ // Setup output slice
+ Window slice_out(slice);
+ slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ if (bias != nullptr)
+ {
+ unsigned int idx1 = 2 * num_arguments_per_4D_tensor();
+ Window slice_biases;
+ slice_biases.use_tensor_dimensions(bias->info()->tensor_shape());
+ add_1D_tensor_argument(idx1, bias, slice_biases);
+ }
+
+ if (_is_nhwc)
+ {
+ unsigned int idx2 = 2 * num_arguments_per_4D_tensor() + ((bias != nullptr) ? num_arguments_per_1D_tensor() : 0);
+ _kernel.setArg(idx2++, static_cast<int>(dst->info()->total_size() - dst->info()->strides_in_bytes().y()));
+ _kernel.setArg<cl_int>(idx2++, _src_height);
+ _kernel.setArg<cl_int>(idx2++, _dst_width);
+ _kernel.setArg<cl_int>(idx2++, _dst_height);
+ }
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, src, slice);
+ add_4D_tensor_argument(idx, dst, slice_out);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h
new file mode 100644
index 0000000000..65bb963061
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_WINOGRAD_OUTPUT_TRANSFORM_KERNEL_H
+#define ARM_COMPUTE_CL_WINOGRAD_OUTPUT_TRANSFORM_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the Winograd output transform kernel. */
+class ClWinogradOutputTransformKernel : public IClKernel
+{
+public:
+ ClWinogradOutputTransformKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWinogradOutputTransformKernel);
+ /** Set the input and output tensor.
+ *
+ * @note Winograd output transform supports the following configurations for NCHW data layout
+ * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
+ * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+ * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+ *
+ * @note Winograd output transform supports the following configurations for NHWC data layout
+ * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+ * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+ *
+ * Strides: only unit strides
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info with shape [C, N, K, batches]. Data types supported: F16/F32.
+ * @param[in] bias Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p src
+ * @param[out] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_output_transform_shape. Data types supported: Same as @p src
+ * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const WinogradInfo &winograd_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
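+ //
+ // Illustrative usage sketch (placeholder tensors; bias may be nullptr and dst can be left
+ // empty as it is auto-initialized), fusing a ReLU activation:
+ //
+ //   ClWinogradOutputTransformKernel kernel;
+ //   kernel.configure(compile_context, &src, &bias, &dst, winograd_info,
+ //                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));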
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClWinogradOutputTransformKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const WinogradInfo &winograd_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>;
+
+ bool _is_nhwc{false};
+ int32_t _src_height{0};
+ int32_t _dst_width{0};
+ int32_t _dst_height{0};
+ int32_t _num_tiles_x{0};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_WINOGRAD_OUTPUT_TRANSFORM_KERNEL_H */
diff --git a/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp b/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp
new file mode 100644
index 0000000000..b5ebac3b49
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <limits>
+#include <utility>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m,
+ unsigned int n,
+ unsigned int m0,
+ unsigned int n0,
+ unsigned int k0,
+ unsigned int v0,
+ unsigned int h0,
+ bool lhs_interleave,
+ bool rhs_interleave,
+ bool lhs_transpose,
+ bool rhs_transpose,
+ bool export_to_cl_image)
+{
+ ARM_COMPUTE_ERROR_ON(m0 == 0 || n0 == 0);
+ ARM_COMPUTE_ERROR_ON(v0 == 0);
+ v0 = std::max(std::min(static_cast<int>(m / m0), static_cast<int>(v0)), static_cast<int>(1));
+
+ if (h0 == 0)
+ {
+ // When h0 is 0, we should take the maximum H0 possible
+ h0 = std::max(n / n0, 1U);
+ }
+ else
+ {
+ h0 = std::max(std::min(static_cast<int>(n / n0), static_cast<int>(h0)), static_cast<int>(1));
+ }
+
+ const GEMMLHSMatrixInfo lhs_info(m0, k0, v0, lhs_transpose, lhs_interleave);
+ const GEMMRHSMatrixInfo rhs_info(n0, k0, h0, rhs_transpose, rhs_interleave, export_to_cl_image);
+
+ return std::make_pair(lhs_info, rhs_info);
+}
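+
+// Worked example (illustrative values): with m = 64, n = 128, m0 = 4, n0 = 4, k0 = 4, v0 = 8
+// and h0 = 0, v0 is clamped to min(64 / 4, 8) = 8 and h0 becomes max(128 / 4, 1) = 32, so the
+// helper returns GEMMLHSMatrixInfo(4, 4, 8, ...) and GEMMRHSMatrixInfo(4, 4, 32, ...).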
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img,
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf,
+ unsigned int n,
+ unsigned int k,
+ unsigned int b,
+ DataType data_type)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(info_buf.second.export_to_cl_image == true,
+ "The fallback GeMM configuration cannot have export_to_cl_image = true");
+
+ const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, data_type);
+ const TensorShape shape = misc::shape_calculator::compute_rhs_reshaped_shape(tensor_rhs_info, info_img.second);
+ const TensorInfo tensor_reshaped_info(shape, 1, data_type);
+
+ if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second)))
+ {
+ return info_img;
+ }
+ else
+ {
+ return info_buf;
+ }
+}
+
+void update_padding_for_cl_image(ITensorInfo *tensor)
+{
+ constexpr unsigned int num_floats_per_pixel = 4;
+
+ const unsigned int stride_y_in_elements = tensor->strides_in_bytes()[1] / tensor->element_size();
+ const unsigned int pixel_alignment = get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device());
+
+ ARM_COMPUTE_ERROR_ON_MSG(pixel_alignment == 0, "Cannot retrieve cl_image pitch alignment");
+ if (pixel_alignment == 0)
+ {
+ return;
+ }
+
+ const unsigned int row_pitch_alignment = pixel_alignment * num_floats_per_pixel;
+ const unsigned int round_up_width =
+ ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment;
+ const unsigned int padding = round_up_width - stride_y_in_elements;
+
+ tensor->extend_padding(PaddingSize(0, tensor->padding().right + padding, 0, 0));
+}
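+
+// Worked example (illustrative values): with stride_y_in_elements = 1000 and a pixel pitch
+// alignment of 64, row_pitch_alignment = 64 * 4 = 256, round_up_width = 1024, and the right
+// padding is extended by 1024 - 1000 = 24 elements.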
+
+Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info)
+{
+ if (rhs_info.export_to_cl_image)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 == 2) || (rhs_info.n0 == 3)) && rhs_info.transpose == false,
+ "Export to cl_image only supported with n0 = 4, 8 or 16");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 == 2) || (rhs_info.k0 == 3)) && rhs_info.transpose == true,
+ "Export to cl_image only supported with k0 = 4, 8 or 16");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(&tensor_reshaped_info, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !image2d_from_buffer_supported(CLKernelLibrary::get().get_device()),
+ "The extension cl_khr_image2d_from_buffer is not supported on the target platform");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0,
+ "Impossible to retrieve the cl_image pitch alignment");
+
+ // Check the width and height of the output tensor.
+ // Since we cannot create a 3d image from a buffer, the third dimension is collapsed on the second dimension
+ const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
+ const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[0] > max_image_w * 4,
+ "Not supported width for cl_image");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ tensor_reshaped_info.tensor_shape()[1] * tensor_reshaped_info.tensor_shape()[2] > max_image_h,
+ "Not supported height for cl_image");
+ }
+
+ return Status{};
+}
+
+bool is_mmul_kernel_preferred(const unsigned int m,
+ const unsigned int n,
+ const unsigned int k,
+ const unsigned int b,
+ const DataType data_type,
+ unsigned int &best_m0,
+ unsigned int &best_n0)
+{
+ ARM_COMPUTE_UNUSED(n, k, b, data_type);
+
+ const unsigned int mmul_k0 = 4;
+ best_m0 = 4;
+ best_n0 = 4;
+
+ const unsigned int ceil_to_multiple_m_m0 = ceil_to_multiple(m, best_m0);
+ const unsigned int m_div_m0 = ceil_to_multiple_m_m0 / best_m0;
+ const unsigned int ceil_to_multiple_m_div_m0_mmul_k0 = ceil_to_multiple(m_div_m0, mmul_k0);
+ const unsigned int gws_y = ceil_to_multiple_m_div_m0_mmul_k0 / mmul_k0;
+
+ return ((k % mmul_k0) == 0) && (gws_y > 4);
+}
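+
+// Worked example (illustrative values): for m = 100 and k = 64, ceil_to_multiple(100, 4) = 100,
+// m_div_m0 = 25, ceil_to_multiple(25, 4) = 28 and gws_y = 7; since k % 4 == 0 and gws_y > 4,
+// the MMUL kernel is preferred.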
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ size_t min_acc = std::numeric_limits<size_t>::max();
+ size_t min_idx = 0;
+
+ ARM_COMPUTE_ERROR_ON(configs.size() == 0);
+ const size_t num_rows = configs.size();
+ const size_t num_cols = configs[0].size();
+
+ ARM_COMPUTE_ERROR_ON_MSG(num_cols != 14U, "The entry should have 14 integer values representing: M, N, K, B, M0, "
+ "N0. K0, V0, H0, INT_LHS, INT_RHS, TRA_LHS, TRA_RHS, IMG_RHS");
+ ARM_COMPUTE_UNUSED(num_cols);
+
+ // Find nearest GeMM workload
+ // Note: the workload does not depend on the K dimension
+ for (size_t y = 0; y < num_rows; ++y)
+ {
+ size_t mc0 = static_cast<size_t>(configs[y][0]);
+ size_t nc0 = static_cast<size_t>(configs[y][1]);
+ size_t kc0 = static_cast<size_t>(configs[y][2]);
+ size_t bc0 = static_cast<size_t>(configs[y][3]);
+
+ size_t acc = 0;
+ acc += (m - mc0) * (m - mc0);
+ acc += (n - nc0) * (n - nc0);
+ acc += (k - kc0) * (k - kc0);
+ acc += (b - bc0) * (b - bc0);
+ acc = std::sqrt(acc);
+ if (acc < min_acc)
+ {
+ min_acc = acc;
+ min_idx = y;
+ }
+ }
+
+ // Get the configuration from the nearest GeMM shape
+ const int m0 = configs[min_idx][4];
+ const int n0 = configs[min_idx][5];
+ const int k0 = configs[min_idx][6];
+ const int v0 = configs[min_idx][7];
+ const int h0 = configs[min_idx][8];
+ const int i_lhs = configs[min_idx][9];
+ const int i_rhs = configs[min_idx][10];
+ const int t_lhs = configs[min_idx][11];
+ const int t_rhs = configs[min_idx][12];
+ const int im_rhs = configs[min_idx][13];
+
+ return configure_lhs_rhs_info(m, n, m0, n0, k0, v0, h0, i_lhs, i_rhs, t_lhs, t_rhs, im_rhs);
+}
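+
+// Illustrative example (hypothetical entry): each GeMMConfigsMatrix row follows the layout
+// { M, N, K, B, M0, N0, K0, V0, H0, INT_LHS, INT_RHS, TRA_LHS, TRA_RHS, IMG_RHS }, e.g.
+//
+//   const GeMMConfigsMatrix configs = {{1024, 1024, 256, 1, 4, 4, 4, 1, 16, 0, 1, 0, 1, 0}};
+//   const auto info = find_lhs_rhs_info(configs, m, n, k, b);
+//
+// The row whose (M, N, K, B) is closest (Euclidean distance) to the query shape is selected.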
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/gemm/ClGemmHelpers.h b/src/gpu/cl/kernels/gemm/ClGemmHelpers.h
new file mode 100644
index 0000000000..84776fb207
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/ClGemmHelpers.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_HELPERS_H
+#define ARM_COMPUTE_CL_GEMM_HELPERS_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+using GeMMConfigsMatrix = std::vector<std::vector<int32_t>>;
+
+/** Configure @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
+ *
+ * @param[in] m Number of rows (M) in the LHS matrix not reshaped
+ * @param[in] n Number of columns (N) in the RHS matrix not reshaped
+ * @param[in] m0 Number of rows processed by each thread/work-item
+ * @param[in] n0 Number of columns processed by each thread/work-item
+ * @param[in] k0 Number of inner accumulations performed by each thread/work-item
+ * @param[in] v0 Number of vertical blocks of size (m0xk0) stored on the same output row
+ * @param[in] h0 Number of horizontal blocks of size (k0xn0) stored on the same output row
+ * @param[in] lhs_interleave True if the v0 (m0xk0) blocks have to be interleaved in the output row
+ * @param[in] rhs_interleave True if the h0 (k0xn0) blocks have to be interleaved in the output row
+ * @param[in] lhs_transpose True if the (m0xk0) block has to be transposed before being stored
+ * @param[in] rhs_transpose True if the (k0xn0) block has to be transposed before being stored
+ * @param[in] export_to_cl_image (Optional) True if the RHS reshaped matrix has to be exported to cl_image
+ *
+ * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
+ */
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m,
+ unsigned int n,
+ unsigned int m0,
+ unsigned int n0,
+ unsigned int k0,
+ unsigned int v0,
+ unsigned int h0,
+ bool lhs_interleave,
+ bool rhs_interleave,
+ bool lhs_transpose,
+ bool rhs_transpose,
+ bool export_to_cl_image = false);
+
+/** Select @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
+ *
+ * This function accepts two GEMMLHSMatrixInfo/GEMMRHSMatrixInfo pairs, where only the first has cl_image2d support,
+ * and selects one of them by validating the GEMMRHSMatrixInfo. If the validation passes, the function returns the
+ * pair with cl_image2d support; otherwise it returns the fallback pair.
+ *
+ * @param[in] info_img GEMMLHSMatrixInfo/GEMMRHSMatrixInfo with cl_image2d support
+ * @param[in] info_buf GEMMLHSMatrixInfo/GEMMRHSMatrixInfo to fall-back if cl_image2d cannot be used
+ * @param[in] n Number of columns (N) in the RHS matrix not reshaped
+ * @param[in] k Number of rows (K) in the RHS matrix not reshaped
+ * @param[in] b Batch size
+ * @param[in] data_type Data type
+ *
+ * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
+ */
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img,
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf,
+ unsigned int n,
+ unsigned int k,
+ unsigned int b,
+ DataType data_type);
+
+/** Update padding required to export the OpenCL buffer to OpenCL image2d
+ *
+ * @param[in,out] tensor ITensorInfo of the tensor required to be exported to OpenCL image2d
+ */
+void update_padding_for_cl_image(ITensorInfo *tensor);
+
+/** Utility function to validate the image2d OpenCL object support on the RHS reshaped matrix
+ *
+ * @param[in] tensor_reshaped_info TensorInfo for the RHS reshaped matrix
+ * @param[in] rhs_info @ref GEMMRHSMatrixInfo
+ *
+ * @return Status reporting if we can use the image2d OpenCL object on the RHS reshaped matrix
+ */
+Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info);
+
+/** Determine if the MMUL kernels should be preferred
+ *
+ * @param[in] m Number of rows of the LHS matrix
+ * @param[in] n Number of columns of the RHS matrix
+ * @param[in] k Number of columns of the LHS matrix, rows of the RHS matrix
+ * @param[in] b Batch size
+ * @param[in] data_type Data type FP32/FP16
+ * @param[in, out] best_m0 Suggested M0 (number of rows of the output block) for the kernel
+ * @param[in, out] best_n0 Suggested N0 (number of columns of the output block) for the kernel
+ *
+ * @return true if MMUL kernel is preferred over kernels w/o MMUL, false otherwise
+ */
+bool is_mmul_kernel_preferred(const unsigned int m,
+ const unsigned int n,
+ const unsigned int k,
+ const unsigned int b,
+ const DataType data_type,
+ unsigned int &best_m0,
+ unsigned int &best_n0);
+
+/** Find the preferred configurations for the LHS and RHS tensors using the GeMMConfigsMatrix provided by the user
+ *
+ * @param[in] configs List of best configurations for a limited number of GeMM shapes
+ * @param[in] m Number of rows of the LHS matrix
+ * @param[in] n Number of columns of the RHS matrix
+ * @param[in] k Number of columns of the LHS matrix, rows of the RHS matrix
+ * @param[in] b Batch size
+ *
+ * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
+ */
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_HELPERS_H */
diff --git a/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h b/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h
new file mode 100644
index 0000000000..9d08633963
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ICL_GEMM_KERNEL_CONFIG_H
+#define ARM_COMPUTE_ICL_GEMM_KERNEL_CONFIG_H
+
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/Types.h"
+
+#include "src/core/common/Macros.h"
+
+#include <array>
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** Basic container for the OpenCL GEMM configuration functions */
+template <class T>
+class CLGEMMConfigArray
+{
+public:
+ /** Alias for F32 index */
+ static constexpr size_t DT_F32 = 0;
+ /** Alias for F16 index */
+ static constexpr size_t DT_F16 = 1;
+ /** Alias for Int8 index */
+ static constexpr size_t DT_INT8 = 2;
+
+ /** Constructor
+ *
+ * @param[in] func_f32 Function to call for GEMM F32
+ * @param[in] func_f16 Function to call for GEMM F16
+ * @param[in] func_int8 Function to call for GEMM Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
+ *
+ */
+ CLGEMMConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8}
+ {
+ }
+
+ /** Method to return the GEMM configuration function based on data type
+ *
+ * @param[in] data_type Input data type
+ *
+ * @return the configuration function for the given data type, or nullptr if the data type is not supported
+ */
+ T get_function(DataType data_type)
+ {
+ switch (data_type)
+ {
+ case DataType::F32:
+ return _configs.at(DT_F32);
+ case DataType::F16:
+ return _configs.at(DT_F16);
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QSYMM8_PER_CHANNEL:
+ return _configs.at(DT_INT8);
+ default:
+ return nullptr;
+ }
+ }
+
+private:
+ std::array<T, 3> _configs;
+};
+
+/** Basic interface for the GEMM kernel configuration */
+class IClGemmKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] arch GPU target
+ */
+ IClGemmKernelConfig(GPUTarget arch) : _target(arch)
+ {
+ }
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClGemmKernelConfig);
+ /** Virtual destructor */
+ virtual ~IClGemmKernelConfig() = default;
+ /** Given M, N, K and B, this method returns the @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo to be used
+ *
+ * @param[in] m Number of rows LHS matrix
+ * @param[in] n Number of columns RHS matrix
+ * @param[in] k Number of columns LHS matrix or number of rows RHS matrix
+ * @param[in] b Batch size
+ * @param[in] data_type Data type
+ */
+ virtual std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0;
+
+protected:
+ GPUTarget _target;
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_ICL_GEMM_KERNEL_CONFIG_H */
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp
new file mode 100644
index 0000000000..2f37eef31f
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/GPUTarget.h"
+
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+ClGemmDefaultConfigNativeBifrost::ClGemmDefaultConfigNativeBifrost(GPUTarget gpu) : IClGemmKernelConfig(gpu)
+{
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+ using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+ ClGemmDefaultConfigNativeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(
+ &ClGemmDefaultConfigNativeBifrost::configure_G71_f32,
+ &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, // We use the F32 heuristic
+ &ClGemmDefaultConfigNativeBifrost::configure_G71_u8);
+
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(
+ &ClGemmDefaultConfigNativeBifrost::configure_G76_f32,
+ &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, // We use the F32 heuristic
+ &ClGemmDefaultConfigNativeBifrost::configure_G76_u8);
+
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(
+ &ClGemmDefaultConfigNativeBifrost::configure_default_f32,
+ &ClGemmDefaultConfigNativeBifrost::configure_default_f32, // We use the F32 heuristic
+ &ClGemmDefaultConfigNativeBifrost::configure_default_u8);
+
+ ConfigurationFunctionExecutorPtr func = nullptr;
+
+ switch (_target)
+ {
+ case GPUTarget::G76:
+ func = configs_G76.get_function(data_type);
+ break;
+ case GPUTarget::G71:
+ func = configs_G71.get_function(data_type);
+ break;
+ default:
+ func = configs_G7x.get_function(data_type);
+ break;
+ }
+
+ ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for GEMM");
+ return (this->*func)(m, n, k, b);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeBifrost::configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if (m == 1)
+ {
+ if (n < 2048)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false);
+ }
+ else if (n >= 2048 && n < 8192)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 1, false, false, false, false);
+ }
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 1, false, false, false, false);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeBifrost::configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if (dot8_supported(CLKernelLibrary::get().get_device()))
+ {
+ if (m == 1)
+ {
+ if (n < 2048)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false);
+ }
+ else if (n >= 2048 && n < 16384)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false);
+ }
+ }
+ else
+ {
+ if (m < 64)
+ {
+ return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false);
+ }
+ }
+ }
+ else
+ {
+ if (m == 1)
+ {
+ if (n < 8192)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false);
+ }
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 2, 8, 16, 1, 1, false, false, false, false);
+ }
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if (m == 1)
+ {
+ if (n > 4196)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 1, false, false, false, false);
+ }
+ else
+ {
+ if (k < 2048)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 1, false, false, false, false);
+ }
+ else if (k >= 2048 && k < 16384)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 1, false, false, false, false);
+ }
+ }
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 2, 8, 2, 1, 1, false, false, false, false);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if (m == 1)
+ {
+ if (n < 2048)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false);
+ }
+ else if (n >= 2048 && n < 16384)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false);
+ }
+ }
+ else
+ {
+ if (m < 64)
+ {
+ return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false);
+ }
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeBifrost::configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 1, false, false, false, false);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeBifrost::configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false);
+}
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
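Every configure() in these heuristics follows the same dispatch pattern: build a small per-data-type table of member-function pointers, look up the entry for the requested DataType, and invoke it on this. The CLGEMMConfigArray helper that provides the table is declared elsewhere in the patch; the fragment below is only an illustrative sketch of that kind of lookup under assumed names, not the library's definition.

    #include "arm_compute/core/Types.h" // assumed to provide the DataType enum

    namespace
    {
    // Sketch only: one entry per supported data type, nullptr when a type has no heuristic.
    template <typename FunctionPtr>
    class PerDataTypeTable
    {
    public:
        PerDataTypeTable(FunctionPtr func_f32, FunctionPtr func_f16, FunctionPtr func_u8)
            : _f32(func_f32), _f16(func_f16), _u8(func_u8)
        {
        }

        FunctionPtr get_function(arm_compute::DataType data_type) const
        {
            switch (data_type)
            {
                case arm_compute::DataType::F32:
                    return _f32;
                case arm_compute::DataType::F16:
                    return _f16;
                case arm_compute::DataType::QASYMM8:
                    return _u8;
                default:
                    return nullptr; // the caller turns this into the "Data type not supported for GEMM" error
            }
        }

    private:
        FunctionPtr _f32;
        FunctionPtr _f16;
        FunctionPtr _u8;
    };
    } // namespace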
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h
new file mode 100644
index 0000000000..f822daae53
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_BIFROST_H
+#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_BIFROST_H
+
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** Bifrost based OpenCL GEMMNative configuration */
+class ClGemmDefaultConfigNativeBifrost final : public IClGemmKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] gpu GPU target
+ */
+ ClGemmDefaultConfigNativeBifrost(GPUTarget gpu);
+
+ // Inherited overridden method
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_BIFROST_H */
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp
new file mode 100644
index 0000000000..f87fb1b659
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/GPUTarget.h"
+
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+ClGemmDefaultConfigNativeMidgard::ClGemmDefaultConfigNativeMidgard(GPUTarget gpu) : IClGemmKernelConfig(gpu)
+{
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard::configure(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+ using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+ ClGemmDefaultConfigNativeMidgard::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(nullptr, nullptr,
+ &ClGemmDefaultConfigNativeMidgard::default_q8);
+
+ auto func = configs_default.get_function(data_type);
+    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for GEMM");
+ return (this->*func)(m, n, k, b);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ const unsigned int m0 = std::min(m, static_cast<unsigned int>(4));
+ const unsigned int n0 = std::min(n, static_cast<unsigned int>(4));
+
+ return configure_lhs_rhs_info(m, n, m0, n0, 2, 1, 1, false, false, false, false);
+}
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h
new file mode 100644
index 0000000000..fa76c5dba7
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_MIDGARD_H
+#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_MIDGARD_H
+
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** Midgard based OpenCL GEMMNative configuration */
+class ClGemmDefaultConfigNativeMidgard final : public IClGemmKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] gpu GPU target
+ */
+ ClGemmDefaultConfigNativeMidgard(GPUTarget gpu);
+
+ // Inherited overridden method
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_MIDGARD_H */
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp
new file mode 100644
index 0000000000..97a1298b0a
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/GPUTarget.h"
+
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+ClGemmDefaultConfigNativeValhall::ClGemmDefaultConfigNativeValhall(GPUTarget gpu) : IClGemmKernelConfig(gpu)
+{
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+ using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+ ClGemmDefaultConfigNativeValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(
+ &ClGemmDefaultConfigNativeValhall::configure_G77_f32, &ClGemmDefaultConfigNativeValhall::configure_G77_f16,
+ &ClGemmDefaultConfigNativeValhall::configure_G77_u8);
+
+ auto func = configs_default.get_function(data_type);
+    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for GEMM");
+ return (this->*func)(m, n, k, b);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if (m == 1)
+ {
+ if (n < 2048)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false);
+ }
+ else if (n >= 2048 && n < 8192)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 1, false, false, false, false);
+ }
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 5, 4, 2, 1, 1, false, false, false, false);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if (m == 1)
+ {
+ if (n < 2048)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false);
+ }
+ else if (n >= 2048 && n < 8192)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 1, false, false, false, false);
+ }
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 8, 2, 1, 1, false, false, false, false);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if (dot8_supported(CLKernelLibrary::get().get_device()))
+ {
+ if (m == 1)
+ {
+ if (n < 2048)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false);
+ }
+ else if (n >= 2048 && n < 16384)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false);
+ }
+ }
+ else
+ {
+ if (m < 64)
+ {
+ return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false);
+ }
+ }
+ }
+ else
+ {
+ if (m == 1)
+ {
+ if (n < 8192)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false);
+ }
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 2, 8, 16, 1, 1, false, false, false, false);
+ }
+ }
+}
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h
new file mode 100644
index 0000000000..c91b095279
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_VALHALL_H
+#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_VALHALL_H
+
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** Valhall based OpenCL GEMMNative configuration */
+class ClGemmDefaultConfigNativeValhall final : public IClGemmKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] gpu GPU target
+ */
+ ClGemmDefaultConfigNativeValhall(GPUTarget gpu);
+
+ // Inherited overridden method
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_VALHALL_H */
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h b/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h
new file mode 100644
index 0000000000..22aa1e2034
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_KERNELS_GEMM_NATIVE_CLGEMMNATIVEKERNELCONFIG_H
+#define ACL_SRC_GPU_CL_KERNELS_GEMM_NATIVE_CLGEMMNATIVEKERNELCONFIG_H
+
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+#include "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h"
+#include "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h"
+#include "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** CLGEMMNative factory class */
+class ClGemmNativeKernelConfigurationFactory final
+{
+public:
+    /** Static method to construct the CLGEMMNative kernel configuration object according to the GPU target
+ *
+ * @param[in] gpu GPU target
+ *
+ * @return CLGEMMNative kernel configuration class
+ */
+ static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
+ {
+ switch (get_arch_from_target(gpu))
+ {
+ case GPUTarget::MIDGARD:
+ return std::make_unique<ClGemmDefaultConfigNativeMidgard>(gpu);
+ case GPUTarget::BIFROST:
+ return std::make_unique<ClGemmDefaultConfigNativeBifrost>(gpu);
+ case GPUTarget::VALHALL:
+ case GPUTarget::FIFTHGEN:
+ return std::make_unique<ClGemmDefaultConfigNativeValhall>(gpu);
+ default:
+                ARM_COMPUTE_ERROR("Unsupported GPU target");
+ }
+ }
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_KERNELS_GEMM_NATIVE_CLGEMMNATIVEKERNELCONFIG_H
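A minimal usage sketch for the factory above, assuming the caller already knows the GPU target and the GEMM problem size; the function name and the m/n/k/b values are placeholders chosen for illustration and do not come from the patch.

    #include "src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h"

    #include <tuple>

    using namespace arm_compute;
    using namespace arm_compute::opencl::kernels::gemm;

    void example_native_gemm_config()
    {
        // Pick the heuristic class for the detected architecture (Midgard, Bifrost or Valhall).
        auto config = ClGemmNativeKernelConfigurationFactory::create(GPUTarget::G76);

        // Query the LHS/RHS block sizes for a hypothetical F32 GEMM: m = 64, n = 128, k = 256, batch = 1.
        GEMMLHSMatrixInfo lhs_info;
        GEMMRHSMatrixInfo rhs_info;
        std::tie(lhs_info, rhs_info) = config->configure(64U, 128U, 256U, 1U, DataType::F32);
    }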
diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp
new file mode 100644
index 0000000000..c956c347ef
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp
@@ -0,0 +1,375 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+using namespace arm_compute::misc::shape_calculator;
+
+ClGemmDefaultConfigReshapedBifrost::ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu) : IClGemmKernelConfig(gpu)
+{
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+ using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+ ClGemmDefaultConfigReshapedBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(
+ &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16,
+ &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8);
+
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52(
+ &ClGemmDefaultConfigReshapedBifrost::configure_G52_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G52_f16,
+ &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8);
+
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(
+ &ClGemmDefaultConfigReshapedBifrost::configure_G76_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G76_f16,
+ &ClGemmDefaultConfigReshapedBifrost::configure_G76_u8);
+
+ ConfigurationFunctionExecutorPtr func = nullptr;
+
+ switch (_target)
+ {
+ case GPUTarget::G76:
+ func = configs_G76.get_function(data_type);
+ break;
+ case GPUTarget::G52:
+ func = configs_G52.get_function(data_type);
+ break;
+ default:
+ func = configs_G7x.get_function(data_type);
+ break;
+ }
+
+    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for GEMM");
+ return (this->*func)(m, n, k, b);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if (n <= 4)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 5, 4, 4, 2, 16, false, true, false, true);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if (n <= 4)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 2, 8, 8, 2, true, true, true, false);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 8, 4, 4, 2, true, true, true, false);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if (dot8_supported(CLKernelLibrary::get().get_device()))
+ {
+ if (n <= 4)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 2, 16, 2, 2, true, false, false, true);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, true, false, false, true);
+ }
+ }
+ else
+ {
+ if (n <= 4)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 2, 8, 2, 2, true, false, false, true);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 6, 4, 4, 2, 2, true, true, false, true);
+ }
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ const float r_mn = static_cast<float>(m) / static_cast<float>(n);
+ const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+ const float r_mk = static_cast<float>(m) / static_cast<float>(k);
+ const float r_nk = static_cast<float>(n) / static_cast<float>(k);
+
+ GEMMLHSMatrixInfo lhs_info_buf;
+ GEMMRHSMatrixInfo rhs_info_buf;
+ GEMMLHSMatrixInfo lhs_info_img;
+ GEMMRHSMatrixInfo rhs_info_img;
+
+ if (workload <= 274.4000f)
+ {
+ if (r_nk <= 0.7461f)
+ {
+ if (r_mn <= 21.1667f)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 2, 4, 4, 4, false, true, true, false, false);
+ }
+ else
+ {
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
+
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+ }
+ }
+ else
+ {
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
+
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+ }
+ }
+ else
+ {
+ if (r_mk <= 17.3926f)
+ {
+ if (workload <= 542.4000f)
+ {
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
+
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+ }
+ else
+ {
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false);
+
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+ }
+ }
+ else
+ {
+ if (r_nk <= 0.5463f)
+ {
+ if (workload <= 11767.6001f)
+ {
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
+
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+ }
+ else
+ {
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false);
+
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+ }
+ }
+ else
+ {
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
+
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+ }
+ }
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+
+ const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+
+ if (workload <= 323.4000f)
+ {
+ return configure_lhs_rhs_info(m, n, 2, 2, 8, 4, 8, false, false, false, true, false);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 8, 4, 2, 2, true, true, true, false, false);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ GEMMLHSMatrixInfo lhs_info_buf;
+ GEMMRHSMatrixInfo rhs_info_buf;
+ GEMMLHSMatrixInfo lhs_info_img;
+ GEMMRHSMatrixInfo rhs_info_img;
+
+ // Get lhs_info/rhs_info in case of OpenCL buffer
+ if (n <= 4)
+ {
+ std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true);
+ }
+ else
+ {
+ std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 2, 8, 16, false, false, false, true);
+ }
+
+ // Get lhs_info/rhs_info in case of OpenCL image
+ // Condition on the GPU workload
+ if ((m / 4) * (n / 4) >= 2560)
+ {
+ // Big workload
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 8, true, true, true, false, true);
+ }
+ else
+ {
+ // Small workload
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 1, true, true, true, false, true);
+ }
+
+ const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32);
+ const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, rhs_info_img);
+ const TensorInfo tensor_reshaped_info(shape, 1, DataType::F32);
+
+ // In case of vector by matrix with few work-items, we use the OpenCL buffer rather than the OpenCL image2d
+    const bool use_cl_image2d = (n > 4);
+
+ if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d)
+ {
+ return std::make_pair(lhs_info_img, rhs_info_img);
+ }
+ else
+ {
+ return std::make_pair(lhs_info_buf, rhs_info_buf);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+ const float r_mk = static_cast<float>(m) / static_cast<float>(k);
+
+ if (workload <= 1595.2000f)
+ {
+ if (r_mk <= 2.1044f)
+ {
+ if (workload <= 870.4000f)
+ {
+ return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 2, true, false, true, false, false);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 2, 4, 2, 2, false, false, true, false, false);
+ }
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 2, 4, 2, 2, false, false, true, false, false);
+ }
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 8, 4, 4, 2, true, true, true, false, false);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if (n <= 4)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, false, false, false, true);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, false, true, false, true);
+ }
+}
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h
new file mode 100644
index 0000000000..9227ec2551
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_BIFROST_H
+#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_BIFROST_H
+
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** Bifrost based OpenCL GEMMReshaped configuration */
+class ClGemmDefaultConfigReshapedBifrost final : public IClGemmKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] gpu GPU target
+ */
+ ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu);
+
+ // Inherited overridden method
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_BIFROST_H */
diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp
new file mode 100644
index 0000000000..70b324eb5a
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp
@@ -0,0 +1,538 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/GPUTarget.h"
+
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+ClGemmDefaultConfigReshapedValhall::ClGemmDefaultConfigReshapedValhall(GPUTarget gpu) : IClGemmKernelConfig(gpu)
+{
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+ using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+ ClGemmDefaultConfigReshapedValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(
+ &ClGemmDefaultConfigReshapedValhall::configure_G77_f32, &ClGemmDefaultConfigReshapedValhall::configure_G77_f16,
+ &ClGemmDefaultConfigReshapedValhall::configure_G77_u8);
+
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(
+ &ClGemmDefaultConfigReshapedValhall::configure_G78_f32, &ClGemmDefaultConfigReshapedValhall::configure_G78_f16,
+ &ClGemmDefaultConfigReshapedValhall::configure_G77_u8);
+
+ ConfigurationFunctionExecutorPtr func = nullptr;
+
+ switch (_target)
+ {
+ case GPUTarget::G78:
+ func = configs_G78.get_function(data_type);
+ break;
+ case GPUTarget::G77:
+ default:
+ func = configs_G77.get_function(data_type);
+ break;
+ }
+
+    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for GEMM");
+ return (this->*func)(m, n, k, b);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if (n <= 4)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, 1, 0, 0, 1);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 5, 4, 4, 2, 16, 0, 1, 0, 1);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ const float r_mn = static_cast<float>(m) / static_cast<float>(n);
+ const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+ const float r_mk = static_cast<float>(m) / static_cast<float>(k);
+ const float r_nk = static_cast<float>(n) / static_cast<float>(k);
+
+ GEMMLHSMatrixInfo lhs_info_buf;
+ GEMMRHSMatrixInfo rhs_info_buf;
+ GEMMLHSMatrixInfo lhs_info_img;
+ GEMMRHSMatrixInfo rhs_info_img;
+
+ std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 0);
+
+ if (r_mk <= 0.11824845522642136)
+ {
+ if (workload <= 880.0)
+ {
+ return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0);
+ }
+ else
+ {
+ if (r_nk <= 0.42521367967128754)
+ {
+ if (workload <= 1726.4000244140625)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 0);
+ }
+ else
+ {
+ std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1);
+
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+ }
+ }
+ else
+ {
+ if (workload <= 1241.6000366210938)
+ {
+ return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 0);
+ }
+ }
+ }
+ }
+ else
+ {
+ if (workload <= 11404.7998046875)
+ {
+ if (r_mk <= 1.0126488208770752)
+ {
+ if (r_mn <= 2.545312523841858)
+ {
+ std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1);
+
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0);
+ }
+ }
+ else
+ {
+ if (workload <= 2881.199951171875)
+ {
+ std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, 0, 0, 1, 0, 1);
+
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+ }
+ else
+ {
+ std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1);
+
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+ }
+ }
+ }
+ else
+ {
+ if (r_nk <= 0.5765306055545807)
+ {
+ if (r_mn <= 6.010416746139526)
+ {
+ std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1);
+
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+ }
+ else
+ {
+ std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1);
+
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+ }
+ }
+ else
+ {
+ std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1);
+
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+ }
+ }
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ const float r_mn = static_cast<float>(m) / static_cast<float>(n);
+ const float r_mk = static_cast<float>(m) / static_cast<float>(k);
+ const float r_nk = static_cast<float>(n) / static_cast<float>(k);
+ const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+
+ if (workload <= 1288.0000f)
+ {
+ if (workload <= 505.6000f)
+ {
+ if (r_mn <= 0.4466f)
+ {
+ if (r_nk <= 0.2384f)
+ {
+ return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 2, 2, 4, 2, 2, 0, 0, 1, 0, 0);
+ }
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 2, 2, 4, 2, 2, 0, 0, 1, 0, 0);
+ }
+ }
+ else
+ {
+ if (r_mn <= 0.2250f)
+ {
+ if (r_mn <= 0.1599f)
+ {
+ return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+ }
+ }
+ else
+ {
+ if (r_mk <= 0.7609f)
+ {
+ if (r_mn <= 2.5453f)
+ {
+ if (workload <= 1089.6000f)
+ {
+ return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 2, 4, 8, 2, 4, 0, 0, 1, 0, 1);
+ }
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 2, 4, 16, 4, 4, 0, 0, 1, 0, 1);
+ }
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1);
+ }
+ }
+ }
+ }
+ else
+ {
+ if (workload <= 5434.4001f)
+ {
+ if (workload <= 1603.2000f)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+ }
+ else
+ {
+ if (r_nk <= 0.6192f)
+ {
+ if (r_mn <= 16.1016f)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+ }
+ else
+ {
+ if (workload <= 2750.0000f)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+ }
+ else
+ {
+ if (r_mk <= 6.3151f)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+ }
+ }
+ }
+ }
+ else
+ {
+ if (r_mk <= 0.0387f)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1);
+ }
+ else
+ {
+ if (r_mk <= 2.5859f)
+ {
+ if (r_mk <= 0.2734f)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+ }
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ if (r_mk <= 25.7500f)
+ {
+ if (r_mk <= 0.3615f)
+ {
+ if (r_mn <= 0.0913f)
+ {
+ if (r_mk <= 0.0683f)
+ {
+ return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1);
+ }
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+ }
+ }
+ else
+ {
+ if (workload <= 11174.3999f)
+ {
+ if (r_mk <= 0.8047f)
+ {
+ return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+ }
+ else
+ {
+ if (workload <= 7185.5999f)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1);
+ }
+ }
+ }
+ else
+ {
+ if (workload <= 17917.5000f)
+ {
+ if (r_mk <= 1.5078f)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1);
+ }
+ }
+ else
+ {
+ if (workload <= 34449.6016f)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 4, 0, 0, 1, 0, 1);
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ if (r_mk <= 331.1111f)
+ {
+ if (workload <= 53397.5996f)
+ {
+ if (r_mn <= 57.8063f)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1);
+ }
+ }
+ else
+ {
+ if (r_nk <= 0.9211f)
+ {
+ return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1);
+ }
+ }
+ }
+ else
+ {
+ if (workload <= 38070.4004f)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+ }
+ }
+ }
+ }
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ const float r_mn = static_cast<float>(m) / static_cast<float>(n);
+ const float r_nk = static_cast<float>(n) / static_cast<float>(k);
+ const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+
+ if (workload <= 801.6000f)
+ {
+ return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1);
+ }
+ else
+ {
+ if (r_mn <= 0.1211f)
+ {
+ if (workload <= 3296.0000f)
+ {
+ return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+ }
+ else
+ {
+ if (r_nk <= 1.0625f)
+ {
+ return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 4, 0, 0, 1, 0, 1);
+ }
+ }
+ }
+ else
+ {
+ if (workload <= 5068.8000f)
+ {
+ return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1);
+ }
+ else
+ {
+ if (r_nk <= 0.2361f)
+ {
+ if (workload <= 12630.0000f)
+ {
+ return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 1, 0, 0, 1, 0, 1);
+ }
+ }
+ else
+ {
+ if (workload <= 178790.3984f)
+ {
+ return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1);
+ }
+ }
+ }
+ }
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if (n <= 4)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, 0, 0, 0, 1);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, 0, 1, 0, 1);
+ }
+}
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h
new file mode 100644
index 0000000000..5f62efb59e
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_VALHALL_H
+#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_VALHALL_H
+
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** Valhall based OpenCL GEMMReshaped configuration */
+class ClGemmDefaultConfigReshapedValhall final : public IClGemmKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] gpu GPU target
+ */
+ ClGemmDefaultConfigReshapedValhall(GPUTarget gpu);
+
+ // Inherited overridden method
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_VALHALL_H */
diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h
new file mode 100644
index 0000000000..6327ee3027
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_KERNELS_GEMM_RESHAPED_CLGEMMRESHAPEDKERNELCONFIG_H
+#define ACL_SRC_GPU_CL_KERNELS_GEMM_RESHAPED_CLGEMMRESHAPEDKERNELCONFIG_H
+
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+#include "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h"
+#include "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** CLGEMMReshaped factory class */
+class ClGemmReshapedKernelConfigurationFactory final
+{
+public:
+    /** Static method to construct the CLGEMMReshaped kernel configuration object according to the GPU target
+ *
+ * @param[in] gpu GPU target
+ *
+ * @return CLGEMMReshaped kernel configuration class
+ */
+ static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
+ {
+ switch (get_arch_from_target(gpu))
+ {
+ case GPUTarget::MIDGARD:
+ case GPUTarget::BIFROST:
+ return std::make_unique<ClGemmDefaultConfigReshapedBifrost>(gpu);
+ case GPUTarget::VALHALL:
+ case GPUTarget::FIFTHGEN:
+ return std::make_unique<ClGemmDefaultConfigReshapedValhall>(gpu);
+ default:
+                ARM_COMPUTE_ERROR("Unsupported GPU target");
+ }
+ }
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_KERNELS_GEMM_RESHAPED_CLGEMMRESHAPEDKERNELCONFIG_H
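The reshaped-GEMM factory above can be driven the same way; the sketch below additionally sizes the reshaped RHS tensor with the shape calculator, mirroring what the G76 F32 heuristic earlier in this patch does internally. The function name and the dimensions are placeholders, not values taken from the patch.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/utils/misc/ShapeCalculator.h"

    #include "src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h"

    #include <tuple>

    using namespace arm_compute;
    using namespace arm_compute::opencl::kernels::gemm;

    void example_reshaped_gemm_config()
    {
        const unsigned int m = 256, n = 256, k = 64, b = 1; // placeholder GEMM dimensions

        // G78 resolves to the Valhall heuristic; Midgard and Bifrost targets share the Bifrost one.
        auto config = ClGemmReshapedKernelConfigurationFactory::create(GPUTarget::G78);

        GEMMLHSMatrixInfo lhs_info;
        GEMMRHSMatrixInfo rhs_info;
        std::tie(lhs_info, rhs_info) = config->configure(m, n, k, b, DataType::F16);

        // Size the reshaped RHS tensor from the returned block sizes, as the heuristics above do.
        const TensorInfo  rhs(TensorShape(n, k, b), 1, DataType::F16);
        const TensorShape reshaped_rhs = misc::shape_calculator::compute_rhs_reshaped_shape(rhs, rhs_info);
    }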
diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp
new file mode 100644
index 0000000000..c4825bfbeb
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp
@@ -0,0 +1,577 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+using namespace arm_compute::misc::shape_calculator;
+
+ClGemmDefaultConfigReshapedRhsOnlyBifrost::ClGemmDefaultConfigReshapedRhsOnlyBifrost(GPUTarget gpu)
+ : IClGemmKernelConfig(gpu)
+{
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+ using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+ ClGemmDefaultConfigReshapedRhsOnlyBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G51(
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32,
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16,
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8);
+
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52(
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32,
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16,
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8);
+
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G31(
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32,
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16,
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8);
+
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32,
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16,
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8);
+
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32,
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16,
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8);
+
+ ConfigurationFunctionExecutorPtr func = nullptr;
+ switch (_target)
+ {
+ case GPUTarget::G76:
+ func = configs_G76.get_function(data_type);
+ break;
+ case GPUTarget::G51:
+ func = configs_G51.get_function(data_type);
+ break;
+ case GPUTarget::G52:
+ func = configs_G52.get_function(data_type);
+ break;
+ case GPUTarget::G31:
+ func = configs_G31.get_function(data_type);
+ break;
+ default:
+ func = configs_G7x.get_function(data_type);
+ break;
+ }
+
+ ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for GEMM");
+ return (this->*func)(m, n, k, b);
+}
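
configure() above dispatches through tables of pointer-to-member functions: one CLGEMMConfigArray per GPU target, indexed by data type, with the chosen entry invoked as (this->*func)(m, n, k, b). For readers less familiar with that idiom, the self-contained sketch below reproduces the pattern with invented types; it illustrates the language feature only, not the library's CLGEMMConfigArray.

    // Minimal illustration of dispatch through pointer-to-member functions (all types here are invented)
    #include <array>
    #include <cstddef>
    #include <cstdio>

    struct Heuristics
    {
        using Fn = int (Heuristics::*)(int m, int n) const;

        int config_f32(int m, int n) const { return m * n; }
        int config_f16(int m, int n) const { return m + n; }

        int configure(std::size_t type_index, int m, int n) const
        {
            // One entry per data type, mirroring configs_G7x / configs_G76 / ... above
            const std::array<Fn, 2> table{&Heuristics::config_f32, &Heuristics::config_f16};
            const Fn func = table.at(type_index);
            return (this->*func)(m, n); // same invocation pattern as (this->*func)(m, n, k, b)
        }
    };

    int main()
    {
        const Heuristics h;
        std::printf("%d %d\n", h.configure(0, 4, 8), h.configure(1, 4, 8)); // prints "32 12"
        return 0;
    }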
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if (m == 1)
+ {
+ if (n <= 2548)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, false, true, false, true, false);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 8, false, true, false, true, false);
+ }
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 4, false, true, false, true);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if (m == 1)
+ {
+ const unsigned int h0 = std::max(n / 2, 1U);
+ return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1);
+ }
+ else
+ {
+ const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1));
+ // The same configuration is currently returned for both the m >= 28 and m < 28 cases
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, 0, 1, 0, 1);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ GEMMLHSMatrixInfo lhs_info_buf;
+ GEMMRHSMatrixInfo rhs_info_buf;
+ GEMMLHSMatrixInfo lhs_info_img;
+ GEMMRHSMatrixInfo rhs_info_img;
+
+ const bool is_workload_big = ((m * n * b) / 16) >= 2048;
+
+ if (m == 1)
+ {
+ if (n >= 8192)
+ {
+ const unsigned int h0 = std::max(n / 4, 1U);
+ return configure_lhs_rhs_info(m, n, 1, 4, 8, 1, h0, false, true, false, true, false);
+ }
+ else
+ {
+ const unsigned int h0 = std::max(n / 2, 1U);
+ if (n <= 204)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true, false);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true, false);
+ }
+ }
+ }
+ else
+ {
+ const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1));
+ if (is_workload_big)
+ {
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true);
+ }
+ else
+ {
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true);
+ }
+ }
+
+ // Get lhs_info/rhs_info in case of OpenCL image
+ const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1));
+ if (is_workload_big)
+ {
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true);
+ }
+ else
+ {
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true);
+ }
+
+ const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32);
+ const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, rhs_info_img);
+ const TensorInfo tensor_reshaped_info(shape, 1, DataType::F32);
+
+ // In case of vector by matrix or small workloads, we use the OpenCL buffer rather than the OpenCL image2d
+ const bool use_cl_image2d = !((m == 1) || ((((m * n * b) / 16) < 2048) && (n < 128)));
+
+ if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d)
+ {
+ return std::make_pair(lhs_info_img, rhs_info_img);
+ }
+ else
+ {
+ return std::make_pair(lhs_info_buf, rhs_info_buf);
+ }
+}
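
The tail of configure_G76_f32 chooses between an OpenCL buffer and a cl_image for the reshaped RHS: gemv cases (m == 1) and small, narrow workloads stay on the buffer, while larger workloads take the image path if validate_image2d_support_on_rhs also passes. The standalone snippet below evaluates just the shape heuristic for two arbitrary example shapes; the device-capability check is deliberately omitted.

    // Shape part of the buffer-versus-image2d decision above (sample shapes are arbitrary)
    #include <cstdio>

    static bool prefers_cl_image2d(unsigned int m, unsigned int n, unsigned int b)
    {
        // Vector-by-matrix (m == 1) and small, narrow workloads fall back to the OpenCL buffer
        return !((m == 1) || ((((m * n * b) / 16) < 2048) && (n < 128)));
    }

    int main()
    {
        std::printf("%d\n", prefers_cl_image2d(1, 4096, 1));  // 0: gemv always stays on the buffer path
        std::printf("%d\n", prefers_cl_image2d(256, 256, 1)); // 1: (256 * 256) / 16 = 4096 >= 2048
        return 0;
    }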
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+ const float r_nk = static_cast<float>(n) / static_cast<float>(k);
+
+ GEMMLHSMatrixInfo lhs_info_buf;
+ GEMMRHSMatrixInfo rhs_info_buf;
+ GEMMLHSMatrixInfo lhs_info_img;
+ GEMMRHSMatrixInfo rhs_info_img;
+
+ if (m == 1)
+ {
+ if (r_nk <= 0.4664f)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 16, false, true, false, true, false);
+ }
+ else
+ {
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, true);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, false);
+
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+ }
+ }
+ else
+ {
+ if (workload <= 274.4000f)
+ {
+ return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 16, false, false, false, true, false);
+ }
+ else
+ {
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, true);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, false);
+
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+ }
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if (m == 1)
+ {
+ const unsigned int n0 = n < 1280 ? 2 : 4;
+ const unsigned int h0 = std::max(n / n0, 1U);
+ return configure_lhs_rhs_info(m, n, 1, n0, 4, 1, h0, false, true, false, true);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if (m == 1)
+ {
+ if (n > 2048)
+ {
+ const unsigned int h0 = std::max(n / 4, 1U);
+ return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true);
+ }
+ else
+ {
+ const unsigned int h0 = std::max(n / 2, 1U);
+ return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true);
+ }
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 4, false, true, false, true);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ const float r_mn = static_cast<float>(m) / static_cast<float>(n);
+ const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+ const float r_mk = static_cast<float>(m) / static_cast<float>(k);
+ const float r_nk = static_cast<float>(n) / static_cast<float>(k);
+
+ GEMMLHSMatrixInfo lhs_info_buf;
+ GEMMRHSMatrixInfo rhs_info_buf;
+ GEMMLHSMatrixInfo lhs_info_img;
+ GEMMRHSMatrixInfo rhs_info_img;
+
+ if (m == 1)
+ {
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, false);
+
+ if (r_mk <= 0.0026f)
+ {
+ if (r_nk <= 0.4664f)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false);
+ }
+ else
+ {
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true);
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+ }
+ }
+ else
+ {
+ if (r_mk <= 0.0148f)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false);
+ }
+ else
+ {
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true);
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+ }
+ }
+ }
+ else
+ {
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 5, 8, 4, 1, 2, false, false, false, false, false);
+
+ if (workload <= 362.6000f)
+ {
+ return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false);
+ }
+ else
+ {
+ if (r_mn <= 22.6067f)
+ {
+ if (workload <= 708.8000f)
+ {
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true);
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 5, 8, 2, 1, 16, false, false, false, false, false);
+ }
+ }
+ else
+ {
+ if (r_nk <= 0.0917f)
+ {
+ return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false);
+ }
+ else
+ {
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true);
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+ }
+ }
+ }
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+
+ if (m == 1)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false);
+ }
+ else
+ {
+ const float r_mn = static_cast<float>(m) / static_cast<float>(n);
+ const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+
+ if (workload <= 7449.60f)
+ {
+ if (workload <= 691.60f)
+ {
+ return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 8, false, false, false, false, false);
+ }
+ else
+ {
+ if (workload <= 4155.20f)
+ {
+ return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 5, 8, 2, 1, 32, false, false, false, false, false);
+ }
+ }
+ }
+ else
+ {
+ if (workload <= 16300.80f)
+ {
+ if (r_mn <= 44.56f)
+ {
+ GEMMLHSMatrixInfo lhs_info_buf;
+ GEMMRHSMatrixInfo rhs_info_buf;
+ GEMMLHSMatrixInfo lhs_info_img;
+ GEMMRHSMatrixInfo rhs_info_img;
+
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, false, true, false, false, true);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
+
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
+ }
+ }
+ else
+ {
+ GEMMLHSMatrixInfo lhs_info_buf;
+ GEMMRHSMatrixInfo rhs_info_buf;
+ GEMMLHSMatrixInfo lhs_info_img;
+ GEMMRHSMatrixInfo rhs_info_img;
+
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, true, false, false, true);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
+
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+ }
+ }
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if (m == 1)
+ {
+ const unsigned int n0 = n < 1280 ? 2 : 4;
+ const unsigned int h0 = std::max(n / n0, 1U);
+ return configure_lhs_rhs_info(m, n, 1, n0, 8, 1, h0, false, true, false, true);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if (dot8_supported(CLKernelLibrary::get().get_device()))
+ {
+ if (m == 1)
+ {
+ const unsigned int h0 = std::max(n / 2, 1U);
+ return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true);
+ }
+ else
+ {
+ const unsigned int h0 = std::max(n / 4, 1U);
+ return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, false, true, false, true);
+ }
+ }
+ else
+ {
+ const int h0 = std::max(std::min(static_cast<int>(n / 2), static_cast<int>(128)), static_cast<int>(1));
+ if (m == 1)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, h0, false, true, false, true);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 2, 16, 1, h0, false, true, false, true);
+ }
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if (m == 1)
+ {
+ const unsigned int h0 = std::max(n / 2, 1U);
+ return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, 2, false, true, false, true);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if (m == 1)
+ {
+ const unsigned int h0 = std::max(n / 2, 1U);
+ return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, false, true, false, true);
+ }
+ else
+ {
+ const unsigned int h0 = std::max(n / 2, 1U);
+ return configure_lhs_rhs_info(m, n, 4, 2, 16, 1, h0, false, true, false, true);
+ }
+}
+
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
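
A pattern that recurs throughout the Bifrost heuristics above is deriving the RHS h0 factor from n and clamping it into a fixed range with nested std::min/std::max, e.g. std::max(std::min(static_cast<int>(n / 4), 256), 1). The snippet below shows an equivalent C++17 formulation with std::clamp; it is purely illustrative and the file keeps the explicit min/max form.

    // Equivalent clamp-based formulation of the recurring h0 computation (illustrative only)
    #include <algorithm>

    static inline int compute_h0(unsigned int n, unsigned int divisor, int max_h0)
    {
        // Same result as std::max(std::min(static_cast<int>(n / divisor), max_h0), 1) whenever max_h0 >= 1
        return std::clamp(static_cast<int>(n / divisor), 1, max_h0);
    }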
diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h
new file mode 100644
index 0000000000..77c0c8d500
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_BIFROST_H
+#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_BIFROST_H
+
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** Bifrost based OpenCL GEMMReshapedOnlyRHS configuration */
+class ClGemmDefaultConfigReshapedRhsOnlyBifrost final : public IClGemmKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] gpu GPU target
+ */
+ ClGemmDefaultConfigReshapedRhsOnlyBifrost(GPUTarget gpu);
+
+ // Inherited overridden method
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_BIFROST_H */
diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp
new file mode 100644
index 0000000000..da3e2ec912
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp
@@ -0,0 +1,755 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+using namespace arm_compute::misc::shape_calculator;
+
+ClGemmDefaultConfigReshapedRhsOnlyValhall::ClGemmDefaultConfigReshapedRhsOnlyValhall(GPUTarget gpu)
+ : IClGemmKernelConfig(gpu)
+{
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+ using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+ ClGemmDefaultConfigReshapedRhsOnlyValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(
+ &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32,
+ &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16,
+ &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
+
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(
+ &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32,
+ &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16,
+ &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
+
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G710(
+ &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32,
+ &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16,
+ &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
+
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G715(
+ &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32,
+ &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16,
+ &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
+
+ ConfigurationFunctionExecutorPtr func = nullptr;
+
+ switch (_target)
+ {
+ case GPUTarget::G78:
+ func = configs_G78.get_function(data_type);
+ break;
+ case GPUTarget::G710:
+ case GPUTarget::G610:
+ func = configs_G710.get_function(data_type);
+ break;
+ case GPUTarget::G715:
+ case GPUTarget::G615:
+ func = configs_G715.get_function(data_type);
+ break;
+ case GPUTarget::G77:
+ default:
+ func = configs_G77.get_function(data_type);
+ break;
+ }
+
+ ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for GEMM");
+ return (this->*func)(m, n, k, b);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ if (m == 1)
+ {
+ const float r_mn = static_cast<float>(m) / static_cast<float>(n);
+ const float r_mk = static_cast<float>(m) / static_cast<float>(k);
+
+ if (r_mk <= 0.0064484127797186375)
+ {
+ if (r_mn <= 0.0028273810748942196)
+ {
+ GEMMLHSMatrixInfo lhs_info_buf;
+ GEMMRHSMatrixInfo rhs_info_buf;
+ GEMMLHSMatrixInfo lhs_info_img;
+ GEMMRHSMatrixInfo rhs_info_img;
+
+ const unsigned int h0 = std::max(n / 4, 1U);
+ std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, 0, 1, 0, 0, 1);
+ std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, 0, 1, 0, 1, 0);
+
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 8, 0, 1, 0, 0, 0);
+ }
+ }
+ else
+ {
+ if (r_mk <= 0.020312500186264515)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, 0, 1, 0, 0, 0);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, 0, 1, 0, 1, 0);
+ }
+ }
+ }
+ else
+ {
+ const float r_mn = static_cast<float>(m) / static_cast<float>(n);
+ const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+ const float r_mk = static_cast<float>(m) / static_cast<float>(k);
+
+ if (workload <= 1999.2000122070312)
+ {
+ if (workload <= 747.1999816894531)
+ {
+ return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
+ }
+ else
+ {
+ GEMMLHSMatrixInfo lhs_info_buf;
+ GEMMRHSMatrixInfo rhs_info_buf;
+ GEMMLHSMatrixInfo lhs_info_img;
+ GEMMRHSMatrixInfo rhs_info_img;
+ std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 2, 0, 0, 0, 1, 1);
+ std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
+
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+ }
+ }
+ else
+ {
+ if (r_mn <= 0.03348214365541935)
+ {
+ if (r_mk <= 0.028125000186264515)
+ {
+ return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
+ }
+ else
+ {
+ GEMMLHSMatrixInfo lhs_info_buf;
+ GEMMRHSMatrixInfo rhs_info_buf;
+ GEMMLHSMatrixInfo lhs_info_img;
+ GEMMRHSMatrixInfo rhs_info_img;
+ std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 2, 0, 0, 0, 1, 1);
+ std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
+
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+ }
+ }
+ else
+ {
+ GEMMLHSMatrixInfo lhs_info_buf;
+ GEMMRHSMatrixInfo rhs_info_buf;
+ GEMMLHSMatrixInfo lhs_info_img;
+ GEMMRHSMatrixInfo rhs_info_img;
+ std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, 0, 1, 0, 0, 1);
+ std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 1, 0, 1, 0);
+
+ return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+ }
+ }
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ const GeMMConfigsMatrix configs_1nkb_best = {
+ {1, 8984, 640, 1, 1, 8, 8, 1, 0, 1, 1, 1, 1, 0}, {1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0},
+ {1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}, {1, 6512, 6404, 1, 1, 4, 8, 1, 0, 1, 0, 1, 0, 0},
+ {1, 5304, 640, 1, 1, 4, 4, 1, 0, 1, 0, 1, 1, 0}, {1, 1352, 1520, 1, 1, 2, 8, 1, 0, 1, 1, 1, 1, 0},
+ {1, 4096, 25088, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}, {1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}};
+
+ const GeMMConfigsMatrix configs_mnkb_n_small_best = {{102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0},
+ {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1},
+ {16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1},
+ {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 1}};
+
+ const GeMMConfigsMatrix configs_mnkb_n_small_fallback = {{102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0},
+ {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0},
+ {16384, 4, 128, 1, 2, 2, 16, 1, 2, 1, 1, 1, 1, 0},
+ {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0}};
+
+ const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = {
+ {25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1},
+ {369664, 32, 28, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1}, {65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0},
+ {23036, 56, 736, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {90968, 40, 600, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1},
+ {8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0},
+ {16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {12604, 60, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1},
+ {29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0},
+ {2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}};
+
+ const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = {
+ {25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 0, 0},
+ {369664, 32, 28, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0},
+ {23036, 56, 736, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {90968, 40, 600, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},
+ {8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0}, {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0},
+ {16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0}, {12604, 60, 160, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0},
+ {29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0},
+ {2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}};
+
+ const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = {
+ {24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0},
+ {49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 1},
+ {49, 1024, 1024, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1},
+ };
+
+ const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = {
+ {24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0},
+ {49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0},
+ {49, 1024, 1024, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0},
+ };
+
+ const GeMMConfigsMatrix configs_mnkb_squared_best = {
+ {72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0},
+ {180, 420, 952, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0},
+ {272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1},
+ {24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}};
+
+ const GeMMConfigsMatrix configs_mnkb_squared_fallback = {
+ {72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0},
+ {180, 420, 952, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, {1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0},
+ {272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0},
+ {24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}};
+
+ const GeMMConfigsMatrix configs_mnkb_best_batched = {
+ {3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1},
+ {688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0},
+ {112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},
+ {1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}};
+
+ const GeMMConfigsMatrix configs_mnkb_fallback_batched = {
+ {3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {4096, 48, 32, 36, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0},
+ {688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0},
+ {112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},
+ {1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}};
+
+ const GeMMConfigsMatrix *configs_best_to_use = nullptr;
+ const GeMMConfigsMatrix *configs_fallback_to_use = nullptr;
+
+ if (b == 1)
+ {
+ constexpr float ratio_m_gt_n = 10.f;
+ constexpr float ratio_n_gt_m = 0.1f;
+ constexpr unsigned int n_small_thr = 4;
+ const float ratio = static_cast<float>(m) / static_cast<float>(n);
+
+ if (m == 1)
+ {
+ // We do not need a fallback in this case, as we never use cl_image for the rhs tensor
+ configs_best_to_use = &configs_1nkb_best;
+ configs_fallback_to_use = &configs_1nkb_best;
+ }
+ else if (n <= n_small_thr && ratio > ratio_m_gt_n)
+ {
+ configs_best_to_use = &configs_mnkb_n_small_best;
+ configs_fallback_to_use = &configs_mnkb_n_small_fallback;
+ }
+ else if (ratio > ratio_m_gt_n)
+ {
+ configs_best_to_use = &configs_mnkb_m_gt_n_best;
+ configs_fallback_to_use = &configs_mnkb_m_gt_n_fallback;
+ }
+ else if (ratio < ratio_n_gt_m)
+ {
+ configs_best_to_use = &configs_mnkb_n_gt_m_best;
+ configs_fallback_to_use = &configs_mnkb_n_gt_m_fallback;
+ }
+ else
+ {
+ configs_best_to_use = &configs_mnkb_squared_best;
+ configs_fallback_to_use = &configs_mnkb_squared_fallback;
+ }
+ }
+ else
+ {
+ configs_best_to_use = &configs_mnkb_best_batched;
+ configs_fallback_to_use = &configs_mnkb_fallback_batched;
+ }
+
+ GEMMLHSMatrixInfo lhs_info0;
+ GEMMRHSMatrixInfo rhs_info0;
+ GEMMLHSMatrixInfo lhs_info1;
+ GEMMRHSMatrixInfo rhs_info1;
+
+ std::tie(lhs_info0, rhs_info0) = find_lhs_rhs_info(*configs_best_to_use, m, n, k, b);
+ std::tie(lhs_info1, rhs_info1) = find_lhs_rhs_info(*configs_fallback_to_use, m, n, k, b);
+
+ return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), std::make_pair(lhs_info1, rhs_info1), n, k, b,
+ DataType::F16);
+}
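
Unlike the F32 path, configure_G77_f16 above is table-driven: each GeMMConfigsMatrix lists previously tuned shapes, with a "best" table whose entries may export the RHS to cl_image and a "fallback" table that stays on buffers; find_lhs_rhs_info picks the entry for the requested shape and select_lhs_rhs_info then decides which of the two candidates to use. The sketch below illustrates the general table-lookup idea with a made-up nearest-shape search; the entry layout, struct names and distance metric are assumptions, not the library's find_lhs_rhs_info.

    // Hedged sketch of a nearest-shape lookup over a tuned-configuration table (not the real implementation)
    #include <cstdlib>
    #include <vector>

    struct TunedEntry
    {
        unsigned int m, n, k, b; // GEMM shape the entry was tuned on
        unsigned int m0, n0, k0; // blocking to apply for that shape
    };

    static TunedEntry find_closest(const std::vector<TunedEntry> &table,
                                   unsigned int m, unsigned int n, unsigned int k, unsigned int b)
    {
        // Assumes a non-empty table
        TunedEntry best       = table.front();
        long       best_score = -1;
        for (const TunedEntry &entry : table)
        {
            // Simple L1 distance over the GEMM dimensions; the real heuristic may weight them differently
            const long score = std::labs(static_cast<long>(entry.m) - static_cast<long>(m)) +
                               std::labs(static_cast<long>(entry.n) - static_cast<long>(n)) +
                               std::labs(static_cast<long>(entry.k) - static_cast<long>(k)) +
                               std::labs(static_cast<long>(entry.b) - static_cast<long>(b));
            if (best_score < 0 || score < best_score)
            {
                best       = entry;
                best_score = score;
            }
        }
        return best;
    }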
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if (m == 1)
+ {
+ const unsigned int h0 = std::max(n / 2, 1U);
+ return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1);
+ }
+ else
+ {
+ const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1));
+ if (m >= 28)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, 0, 1, 0, 1);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 2, 4, 16, 1, h0, 0, 1, 0, 1);
+ }
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ const float r_mn = static_cast<float>(m) / static_cast<float>(n);
+ const float r_mk = static_cast<float>(m) / static_cast<float>(k);
+ const float r_nk = static_cast<float>(n) / static_cast<float>(k);
+ const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+
+ if (m == 1)
+ {
+ if (workload <= 278.7000f)
+ {
+ if (workload <= 7.5000f)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
+ }
+ else
+ {
+ if (r_mn <= 0.0031f)
+ {
+ if (workload <= 256.6000f)
+ {
+ if (workload <= 16.7500f)
+ {
+ if (r_nk <= 1.6671f)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
+ }
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
+ }
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
+ }
+ }
+ else
+ {
+ if (r_mk <= 0.0027f)
+ {
+ if (r_mk <= 0.0014f)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
+ }
+ else
+ {
+ if (workload <= 8.9500f)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
+ }
+ }
+ }
+ else
+ {
+ if (workload <= 14.1500f)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
+ }
+ else
+ {
+ if (r_mk <= 0.0041f)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
+ }
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ if (workload <= 363.7000f)
+ {
+ if (r_mk <= 0.0031f)
+ {
+ return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 32, 0, 1, 0, 1, 0);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 32, 0, 1, 0, 1, 0);
+ }
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 32, 0, 1, 0, 1, 0);
+ }
+ }
+ }
+ else
+ {
+ if (workload <= 1384.8000f)
+ {
+ if (workload <= 704.0000f)
+ {
+ return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 32, 0, 1, 0, 1, 0);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 4, 0, 0, 0, 1, 1);
+ }
+ }
+ else
+ {
+ if (workload <= 16761.6006f)
+ {
+ if (r_mn <= 187.1250f)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 0, 0, 1, 1);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 4, 0, 0, 0, 1, 1);
+ }
+ }
+ else
+ {
+ if (r_mk <= 432.4630f)
+ {
+ return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 16, 0, 0, 0, 1, 1);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 16, 0, 1, 0, 1, 1);
+ }
+ }
+ }
+ }
+}
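
To make the thresholds in configure_G78_f32 concrete, here is a worked pass through the decision tree for one sample shape chosen purely for illustration:

    // Worked example for the configure_G78_f32 decision tree above (the sample shape is arbitrary)
    #include <cstdio>

    int main()
    {
        const unsigned int m = 64, n = 64, k = 128, b = 1;
        const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
        // workload == 204.8f, so with m != 1 the branches workload <= 1384.8 and workload <= 704.0 are taken
        // and the tree selects configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 32, 0, 1, 0, 1, 0).
        std::printf("m=%u n=%u k=%u b=%u -> workload=%.1f\n", m, n, k, b, workload);
        return 0;
    }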
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+ const float r_mn = static_cast<float>(m) / static_cast<float>(n);
+ const float r_mk = static_cast<float>(m) / static_cast<float>(k);
+ const float r_nk = static_cast<float>(n) / static_cast<float>(k);
+
+ if (m == 1)
+ {
+ const GeMMConfigsMatrix configs_mnkb_best = {
+ {1, 8984, 640, 1, 1, 4, 2, 1, 0, 1, 0, 1, 1, 0}, {1, 420, 392, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0},
+ {1, 644, 5288, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, {1, 6512, 6404, 1, 1, 2, 2, 1, 0, 1, 0, 1, 1, 0},
+ {1, 5304, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0}, {1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0},
+ {1, 4096, 25088, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, {1, 732, 8988, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}};
+
+ return find_lhs_rhs_info(configs_mnkb_best, m, n, k, b);
+ }
+ else
+ {
+ if (workload <= 1384.8000f)
+ {
+ if (r_nk <= 0.8333f)
+ {
+ if (r_mk <= 0.9119f)
+ {
+ return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 4, 0, 1, 0, 1, 1);
+ }
+ else
+ {
+ if (r_nk <= 0.1181f)
+ {
+ return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 32, 0, 0, 1, 0, 0);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0);
+ }
+ }
+ }
+ else
+ {
+ if (r_mk <= 1.0013f)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 1);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 5, 4, 8, 1, 4, 0, 1, 1, 0, 1);
+ }
+ }
+ }
+ else
+ {
+ if (workload <= 11404.7998f)
+ {
+ if (r_mk <= 2.2884f)
+ {
+ if (r_nk <= 0.9286f)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 4, 0, 1, 1, 0, 1);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 1);
+ }
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 5, 4, 8, 1, 4, 0, 1, 1, 0, 1);
+ }
+ }
+ else
+ {
+ if (r_nk <= 1.1926f)
+ {
+ if (r_mn <= 1385.7917f)
+ {
+ return configure_lhs_rhs_info(m, n, 6, 4, 8, 1, 4, 0, 1, 1, 0, 1);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 32, 0, 1, 1, 0, 0);
+ }
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 6, 4, 8, 1, 32, 0, 1, 1, 0, 1);
+ }
+ }
+ }
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ unsigned int best_m0;
+ unsigned int best_n0;
+
+ if (is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0))
+ {
+ return configure_lhs_rhs_info(m, n, best_m0, best_n0, 1, 1, 4, false, true, false, false, true);
+ }
+ else
+ {
+ return configure_G77_f32(m, n, k, b);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ const GeMMConfigsMatrix configs_1nkb_best = {
+ {1, 8984, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0}, {1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0},
+ {1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}, {1, 6512, 6404, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0},
+ {1, 5304, 640, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, {1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0},
+ {1, 4096, 25088, 1, 1, 2, 8, 1, 0, 1, 0, 1, 1, 0}, {1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}};
+
+ const GeMMConfigsMatrix configs_mnkb_n_small_best = {{102400, 4, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0},
+ {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0},
+ {16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0},
+ {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}};
+
+ const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = {
+ {25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 2, 4, 16, 1, 8, 1, 1, 1, 0, 1},
+ {369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0},
+ {23036, 56, 736, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+ {8944, 32, 776, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {2688, 136, 1492, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+ {50176, 64, 300, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1}, {16544, 104, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+ {12604, 60, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {3728, 96, 196, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+ {29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0},
+ };
+
+ const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = {
+ {25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0},
+ {369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0},
+ {23036, 56, 736, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 0},
+ {8944, 32, 776, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0}, {2688, 136, 1492, 1, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0},
+ {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {16544, 104, 160, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0},
+ {12604, 60, 160, 1, 2, 8, 8, 1, 8, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 2, 8, 8, 1, 64, 1, 1, 1, 0, 0},
+ {29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0},
+ };
+
+ const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = {{24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0},
+ {49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0},
+ {49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0}};
+
+ const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = {{24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0},
+ {49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0},
+ {49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0}};
+
+ const GeMMConfigsMatrix configs_mnkb_squared_best = {
+ {24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0},
+ {72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+ {180, 420, 952, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1}, {1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0},
+ {272, 400, 2116, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {196, 512, 512, 1, 5, 2, 8, 1, 4, 1, 1, 1, 1, 1},
+ };
+
+ const GeMMConfigsMatrix configs_mnkb_squared_fallback = {
+ {24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0},
+ {72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0},
+ {180, 420, 952, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0}, {1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0},
+ {272, 400, 2116, 1, 2, 8, 4, 1, 4, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0},
+ };
+
+ const GeMMConfigsMatrix configs_mnkb_best_batched = {
+ {3136, 64, 64, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 1}, {4096, 48, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+ {688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 1}, {24, 464, 412, 24, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+ {112, 184, 144, 28, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {5776, 64, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+ {1568, 64, 40, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1}, {2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1}};
+
+ const GeMMConfigsMatrix configs_mnkb_fallback_batched = {
+ {3136, 64, 64, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, {4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0},
+ {688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0}, {24, 464, 412, 24, 2, 8, 4, 1, 32, 1, 1, 1, 0, 0},
+ {112, 184, 144, 28, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 2, 8, 8, 1, 32, 1, 1, 1, 0, 0},
+ {1568, 64, 40, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}};
+
+ const GeMMConfigsMatrix *configs_best_to_use = nullptr;
+ const GeMMConfigsMatrix *configs_fallback_to_use = nullptr;
+
+ if (b == 1)
+ {
+ constexpr float ratio_m_gt_n = 10.f;
+ constexpr float ratio_n_gt_m = 0.1f;
+ constexpr unsigned int n_small_thr = 4;
+ const float ratio = static_cast<float>(m) / static_cast<float>(n);
+
+ if (m == 1)
+ {
+ // We do not need a fallback in this case, as we never use cl_image for the rhs tensor
+ configs_best_to_use = &configs_1nkb_best;
+ configs_fallback_to_use = &configs_1nkb_best;
+ }
+ else if (n <= n_small_thr && ratio > ratio_m_gt_n)
+ {
+ configs_best_to_use = &configs_mnkb_n_small_best;
+ configs_fallback_to_use = &configs_mnkb_n_small_best;
+ }
+ else if (ratio > ratio_m_gt_n)
+ {
+ configs_best_to_use = &configs_mnkb_m_gt_n_best;
+ configs_fallback_to_use = &configs_mnkb_m_gt_n_fallback;
+ }
+ else if (ratio < ratio_n_gt_m)
+ {
+ configs_best_to_use = &configs_mnkb_n_gt_m_best;
+ configs_fallback_to_use = &configs_mnkb_n_gt_m_fallback;
+ }
+ else
+ {
+ configs_best_to_use = &configs_mnkb_squared_best;
+ configs_fallback_to_use = &configs_mnkb_squared_fallback;
+ }
+ }
+ else
+ {
+ configs_best_to_use = &configs_mnkb_best_batched;
+ configs_fallback_to_use = &configs_mnkb_fallback_batched;
+ }
+
+ GEMMLHSMatrixInfo lhs_info0;
+ GEMMRHSMatrixInfo rhs_info0;
+ GEMMLHSMatrixInfo lhs_info1;
+ GEMMRHSMatrixInfo rhs_info1;
+
+ std::tie(lhs_info0, rhs_info0) = find_lhs_rhs_info(*configs_best_to_use, m, n, k, b);
+ std::tie(lhs_info1, rhs_info1) = find_lhs_rhs_info(*configs_fallback_to_use, m, n, k, b);
+
+ return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), std::make_pair(lhs_info1, rhs_info1), n, k, b,
+ DataType::F16);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ unsigned int best_m0;
+ unsigned int best_n0;
+
+ if (is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0))
+ {
+ return configure_lhs_rhs_info(m, n, best_m0, best_n0, 1, 1, 4, false, true, false, false, true);
+ }
+ else
+ {
+ return configure_G78_f16(m, n, k, b);
+ }
+}
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h
new file mode 100644
index 0000000000..a0ea337eb1
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_VALHALL_H
+#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_VALHALL_H
+
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** Valhall based OpenCL GEMMReshapedOnlyRHS configuration */
+class ClGemmDefaultConfigReshapedRhsOnlyValhall final : public IClGemmKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] gpu GPU target
+ */
+ ClGemmDefaultConfigReshapedRhsOnlyValhall(GPUTarget gpu);
+
+ // Inherited overridden method
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_VALHALL_H */
diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h
new file mode 100644
index 0000000000..1f0c5c2d87
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_KERNELS_GEMM_RESHAPED_ONLY_RHS_CLGEMMRESHAPEDONLYRHSKERNELCONFIG_H
+#define ACL_SRC_GPU_CL_KERNELS_GEMM_RESHAPED_ONLY_RHS_CLGEMMRESHAPEDONLYRHSKERNELCONFIG_H
+
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+#include "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h"
+#include "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** CLGEMMReshapedOnlyRHS factory class */
+class ClGemmReshapedOnlyRhsKernelConfigurationFactory final
+{
+public:
+ /** Static method to call the CLGEMMReshapedOnlyRHS kernel configuration class according to the GPU target
+ *
+ * @param[in] gpu GPU target
+ *
+ * @return CLGEMMReshapedOnlyRHS kernel configuration class
+ */
+ static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
+ {
+ switch (get_arch_from_target(gpu))
+ {
+ case GPUTarget::MIDGARD:
+ case GPUTarget::BIFROST:
+ return std::make_unique<ClGemmDefaultConfigReshapedRhsOnlyBifrost>(gpu);
+ case GPUTarget::VALHALL:
+ case GPUTarget::FIFTHGEN:
+ return std::make_unique<ClGemmDefaultConfigReshapedRhsOnlyValhall>(gpu);
+ default:
+ ARM_COMPUTE_ERROR("Not supported GPU target");
+ }
+ }
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_KERNELS_GEMM_RESHAPED_ONLY_RHS_CLGEMMRESHAPEDONLYRHSKERNELCONFIG_H
diff --git a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp
new file mode 100644
index 0000000000..689a743fdf
--- /dev/null
+++ b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/math/Math.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+Status validate_matmul_input_shapes(const TensorShape &lhs_shape,
+ const TensorShape &rhs_shape,
+ const MatMulKernelInfo &matmul_kernel_info)
+{
+ const size_t lhs_k = matmul_kernel_info.adj_lhs ? lhs_shape.y() : lhs_shape.x();
+ const size_t rhs_k = matmul_kernel_info.adj_rhs ? rhs_shape.x() : rhs_shape.y();
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_k != rhs_k, "K dimension in Lhs and Rhs matrices must match.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape.total_size() == 0, "Lhs tensor can't be empty");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_shape.total_size() == 0, "Rhs tensor can't be empty");
+
+ constexpr size_t batch_dim_start = 2;
+ for (size_t i = batch_dim_start; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape[i] != rhs_shape[i], "Batch dimension broadcasting is not supported");
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window_for_mmul_kernels(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
+ int mmul_m0,
+ int mmul_n0)
+{
+ ARM_COMPUTE_UNUSED(lhs, rhs);
+
+ const Window win = calculate_max_window(*dst, Steps(1, 1));
+
+ // Collapse along the Z direction
+ // This collapse needs to be here in order to tune the Z dimension of LWS
+ Window collapsed = win.collapse(win, Window::DimZ);
+
+ // Reconfigure window size: one arm_matrix_multiply call needs 16 threads to finish.
+ Window::Dimension x_dimension = collapsed.x();
+ Window::Dimension y_dimension = collapsed.y();
+
+ const int m = dst->dimension(1);
+ const int n = dst->dimension(0);
+
+ const int m0 = std::min(matmul_kernel_info.m0, m);
+ const int n0 = adjust_vec_size(matmul_kernel_info.n0, n);
+
+ // Make M and N multiple of M0 and N0 respectively
+ const unsigned int ceil_to_multiple_n_n0 = ceil_to_multiple(n, n0);
+ const unsigned int ceil_to_multiple_m_m0 = ceil_to_multiple(m, m0);
+
+ // Divide M and N by M0 and N0 respectively
+ const unsigned int n_div_n0 = ceil_to_multiple_n_n0 / n0;
+ const unsigned int m_div_m0 = ceil_to_multiple_m_m0 / m0;
+
+ // Make n_div_n0 and m_div_m0 multiple of mmul_n0 and mmul_m0 respectively
+ const unsigned int ceil_to_multiple_n_div_n0_mmul_n0 = ceil_to_multiple(n_div_n0, mmul_n0);
+ const unsigned int ceil_to_multiple_m_div_m0_mmul_m0 = ceil_to_multiple(m_div_m0, mmul_m0);
+
+ // Ensure x_dimension is multiple of MMUL block size (mmul_m0 * mmul_n0)
+ x_dimension.set_end(ceil_to_multiple_n_div_n0_mmul_n0 * mmul_m0);
+ y_dimension.set_end(ceil_to_multiple_m_div_m0_mmul_m0 / mmul_m0);
+
+ collapsed.set(Window::DimX, x_dimension);
+ collapsed.set(Window::DimY, y_dimension);
+
+ return std::make_pair(Status{}, collapsed);
+}
+
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
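To make the window arithmetic in validate_and_configure_window_for_mmul_kernels() concrete, here is a small self-contained worked example (an editor's sketch with hypothetical sizes, not part of the patch) that mirrors the same rounding steps:

#include <iostream>

// Same rounding behaviour as arm_compute's ceil_to_multiple(), re-implemented
// locally so the example compiles on its own.
static unsigned int ceil_to_multiple_ex(unsigned int value, unsigned int divisor)
{
    return ((value + divisor - 1) / divisor) * divisor;
}

int main()
{
    // Hypothetical dst with m = 62 rows and n = 103 columns, m0 = n0 = 4,
    // and a 4x4 MMUL block (mmul_m0 = mmul_n0 = 4).
    const unsigned int m = 62, n = 103, m0 = 4, n0 = 4, mmul_m0 = 4, mmul_n0 = 4;

    const unsigned int n_div_n0 = ceil_to_multiple_ex(n, n0) / n0; // 103 -> 104, 104 / 4 = 26
    const unsigned int m_div_m0 = ceil_to_multiple_ex(m, m0) / m0; //  62 ->  64,  64 / 4 = 16

    const unsigned int x_end = ceil_to_multiple_ex(n_div_n0, mmul_n0) * mmul_m0; // 28 * 4 = 112
    const unsigned int y_end = ceil_to_multiple_ex(m_div_m0, mmul_m0) / mmul_m0; // 16 / 4 = 4

    std::cout << "collapsed window end: x = " << x_end << ", y = " << y_end << "\n";
    return 0;
}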
diff --git a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h
new file mode 100644
index 0000000000..c2ae2a67f4
--- /dev/null
+++ b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_GPU_CL_KERNELS_HELPERS_MATMULKERNELHELPERS_H
+#define ACL_SRC_GPU_CL_KERNELS_HELPERS_MATMULKERNELHELPERS_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Validate the input shapes of a Matmul operation
+ *
+ * @param[in] lhs_shape Lhs tensor shape
+ * @param[in] rhs_shape Rhs tensor shape
+ * @param[in] matmul_kernel_info Matmul kernel info
+ *
+ * @return a Status indicating whether the shapes and matmul kernel info are compatible
+ */
+Status validate_matmul_input_shapes(const TensorShape &lhs_shape,
+ const TensorShape &rhs_shape,
+ const MatMulKernelInfo &matmul_kernel_info);
+
+/** Validate and configure window for Matmul MMUL kernels
+ *
+ * @param[in] lhs Lhs tensor info
+ * @param[in] rhs Rhs tensor info
+ * @param[in] dst Dst tensor info
+ * @param[in] matmul_kernel_info Matmul kernel info
+ * @param[in] mmul_m0 Number of rows in the MMUL block
+ * @param[in] mmul_n0 Number of columns in the MMUL block
+ *
+ * @return a pair of Status and Window object
+ */
+std::pair<Status, Window> validate_and_configure_window_for_mmul_kernels(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
+ int mmul_m0,
+ int mmul_n0);
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+
+#endif // ACL_SRC_GPU_CL_KERNELS_HELPERS_MATMULKERNELHELPERS_H
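Finally, a hedged sketch of how a matmul MMUL kernel's validate() path might combine the two helpers declared above (an editor's illustration, not part of the patch; the MatMulKernelInfo field names and the 4x4 MMUL block size are assumptions made for the example):

#include "arm_compute/core/ITensorInfo.h"

#include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h"

using namespace arm_compute;
using namespace arm_compute::opencl::kernels;

Status validate_example(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst)
{
    MatMulKernelInfo info{};
    info.m0      = 4;     // rows of dst computed per thread (assumed field name)
    info.n0      = 4;     // columns of dst computed per thread (assumed field name)
    info.adj_lhs = false; // LHS is not transposed
    info.adj_rhs = false; // RHS is not transposed

    // Shape check: K dimensions must match and batch dimensions may not broadcast.
    ARM_COMPUTE_RETURN_ON_ERROR(
        validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), info));

    // Window check, assuming a 4x4 MMUL block (mmul_m0 = mmul_n0 = 4).
    const auto win_config = validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, info, 4, 4);
    return win_config.first;
}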