Diffstat (limited to 'src/core/CL/kernels')
-rw-r--r-- src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp | 138
-rw-r--r-- src/core/CL/kernels/CLArgMinMaxLayerKernel.h | 41
-rw-r--r-- src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp | 129
-rw-r--r-- src/core/CL/kernels/CLBatchNormalizationLayerKernel.h | 35
-rw-r--r-- src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp | 122
-rw-r--r-- src/core/CL/kernels/CLBatchToSpaceLayerKernel.h | 36
-rw-r--r-- src/core/CL/kernels/CLBitwiseKernel.cpp | 28
-rw-r--r-- src/core/CL/kernels/CLBitwiseKernel.h | 6
-rw-r--r-- src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp | 48
-rw-r--r-- src/core/CL/kernels/CLBoundingBoxTransformKernel.h | 16
-rw-r--r-- src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp | 96
-rw-r--r-- src/core/CL/kernels/CLChannelShuffleLayerKernel.h | 5
-rw-r--r-- src/core/CL/kernels/CLCol2ImKernel.cpp | 176
-rw-r--r-- src/core/CL/kernels/CLCol2ImKernel.h | 106
-rw-r--r-- src/core/CL/kernels/CLComparisonKernel.cpp | 111
-rw-r--r-- src/core/CL/kernels/CLComparisonKernel.h | 21
-rw-r--r-- src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp | 121
-rw-r--r-- src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h | 90
-rw-r--r-- src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp | 30
-rw-r--r-- src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h | 5
-rw-r--r-- src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp | 89
-rw-r--r-- src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h | 23
-rw-r--r-- src/core/CL/kernels/CLDepthConvertLayerKernel.cpp | 160
-rw-r--r-- src/core/CL/kernels/CLDepthConvertLayerKernel.h | 91
-rw-r--r-- src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp | 33
-rw-r--r-- src/core/CL/kernels/CLDepthToSpaceLayerKernel.h | 4
-rw-r--r-- src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp | 434
-rw-r--r-- src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h | 114
-rw-r--r-- src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp | 464
-rw-r--r-- src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h | 113
-rw-r--r-- src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp | 416
-rw-r--r-- src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h | 114
-rw-r--r-- src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp | 132
-rw-r--r-- src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h | 85
-rw-r--r-- src/core/CL/kernels/CLDequantizationLayerKernel.cpp | 159
-rw-r--r-- src/core/CL/kernels/CLDequantizationLayerKernel.h | 79
-rw-r--r-- src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp | 585
-rw-r--r-- src/core/CL/kernels/CLDirectConvolutionLayerKernel.h | 126
-rw-r--r-- src/core/CL/kernels/CLFFTDigitReverseKernel.cpp | 48
-rw-r--r-- src/core/CL/kernels/CLFFTDigitReverseKernel.h | 18
-rw-r--r-- src/core/CL/kernels/CLFFTRadixStageKernel.cpp | 54
-rw-r--r-- src/core/CL/kernels/CLFFTRadixStageKernel.h | 9
-rw-r--r-- src/core/CL/kernels/CLFFTScaleKernel.cpp | 61
-rw-r--r-- src/core/CL/kernels/CLFFTScaleKernel.h | 9
-rw-r--r-- src/core/CL/kernels/CLFillBorderKernel.cpp | 63
-rw-r--r-- src/core/CL/kernels/CLFillBorderKernel.h | 18
-rw-r--r-- src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp | 133
-rw-r--r-- src/core/CL/kernels/CLFuseBatchNormalizationKernel.h | 41
-rw-r--r-- src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp | 334
-rw-r--r-- src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h | 108
-rw-r--r-- src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp | 297
-rw-r--r-- src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h | 123
-rw-r--r-- src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp | 573
-rw-r--r-- src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h | 155
-rw-r--r-- src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp | 215
-rw-r--r-- src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h | 116
-rw-r--r-- src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp | 274
-rw-r--r-- src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h | 136
-rw-r--r-- src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp | 153
-rw-r--r-- src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h | 89
-rw-r--r-- src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp | 160
-rw-r--r-- src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h | 101
-rw-r--r-- src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp | 158
-rw-r--r-- src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h | 102
-rw-r--r-- src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp | 221
-rw-r--r-- src/core/CL/kernels/CLGEMMLowpReductionKernel.h | 176
-rw-r--r-- src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp | 544
-rw-r--r-- src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h | 122
-rw-r--r-- src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp | 422
-rw-r--r-- src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h | 127
-rw-r--r-- src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp | 443
-rw-r--r-- src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h | 188
-rw-r--r-- src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp | 449
-rw-r--r-- src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h | 168
-rw-r--r-- src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp | 221
-rw-r--r-- src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h | 105
-rw-r--r-- src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp | 175
-rw-r--r-- src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h | 135
-rw-r--r-- src/core/CL/kernels/CLGatherKernel.cpp | 49
-rw-r--r-- src/core/CL/kernels/CLGatherKernel.h | 10
-rw-r--r-- src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp | 39
-rw-r--r-- src/core/CL/kernels/CLGenerateProposalsLayerKernel.h | 7
-rw-r--r-- src/core/CL/kernels/CLIm2ColKernel.cpp | 441
-rw-r--r-- src/core/CL/kernels/CLIm2ColKernel.h | 136
-rw-r--r-- src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp | 154
-rw-r--r-- src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h | 70
-rw-r--r-- src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp | 93
-rw-r--r-- src/core/CL/kernels/CLL2NormalizeLayerKernel.h | 11
-rw-r--r-- src/core/CL/kernels/CLLKTrackerKernel.cpp | 314
-rw-r--r-- src/core/CL/kernels/CLLKTrackerKernel.h | 206
-rw-r--r-- src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp | 51
-rw-r--r-- src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h | 15
-rw-r--r-- src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp | 55
-rw-r--r-- src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h | 5
-rw-r--r-- src/core/CL/kernels/CLMinMaxLayerKernel.cpp | 170
-rw-r--r-- src/core/CL/kernels/CLMinMaxLayerKernel.h | 87
-rw-r--r-- src/core/CL/kernels/CLNormalizationLayerKernel.cpp | 165
-rw-r--r-- src/core/CL/kernels/CLNormalizationLayerKernel.h | 7
-rw-r--r-- src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp | 98
-rw-r--r-- src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h | 9
-rw-r--r-- src/core/CL/kernels/CLPadLayerKernel.cpp | 99
-rw-r--r-- src/core/CL/kernels/CLPadLayerKernel.h | 20
-rw-r--r-- src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp | 481
-rw-r--r-- src/core/CL/kernels/CLPixelWiseMultiplicationKernel.h | 196
-rw-r--r-- src/core/CL/kernels/CLPriorBoxLayerKernel.cpp | 86
-rw-r--r-- src/core/CL/kernels/CLPriorBoxLayerKernel.h | 27
-rw-r--r-- src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp | 51
-rw-r--r-- src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h | 9
-rw-r--r-- src/core/CL/kernels/CLQuantizationLayerKernel.cpp | 178
-rw-r--r-- src/core/CL/kernels/CLQuantizationLayerKernel.h | 86
-rw-r--r-- src/core/CL/kernels/CLROIAlignLayerKernel.cpp | 58
-rw-r--r-- src/core/CL/kernels/CLROIAlignLayerKernel.h | 18
-rw-r--r-- src/core/CL/kernels/CLROIPoolingLayerKernel.cpp | 149
-rw-r--r-- src/core/CL/kernels/CLROIPoolingLayerKernel.h | 36
-rw-r--r-- src/core/CL/kernels/CLRangeKernel.cpp | 45
-rw-r--r-- src/core/CL/kernels/CLRangeKernel.h | 1
-rw-r--r-- src/core/CL/kernels/CLReductionOperationKernel.cpp | 333
-rw-r--r-- src/core/CL/kernels/CLReductionOperationKernel.h | 19
-rw-r--r-- src/core/CL/kernels/CLRemapKernel.cpp | 116
-rw-r--r-- src/core/CL/kernels/CLRemapKernel.h | 81
-rw-r--r-- src/core/CL/kernels/CLReorgLayerKernel.cpp | 46
-rw-r--r-- src/core/CL/kernels/CLReorgLayerKernel.h | 1
-rw-r--r-- src/core/CL/kernels/CLReverseKernel.cpp | 48
-rw-r--r-- src/core/CL/kernels/CLReverseKernel.h | 48
-rw-r--r-- src/core/CL/kernels/CLScaleKernel.cpp | 286
-rw-r--r-- src/core/CL/kernels/CLScaleKernel.h | 93
-rw-r--r-- src/core/CL/kernels/CLSelectKernel.cpp | 37
-rw-r--r-- src/core/CL/kernels/CLSelectKernel.h | 7
-rw-r--r-- src/core/CL/kernels/CLSoftmaxLayerKernel.cpp | 370
-rw-r--r-- src/core/CL/kernels/CLSoftmaxLayerKernel.h | 158
-rw-r--r-- src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp | 104
-rw-r--r-- src/core/CL/kernels/CLSpaceToBatchLayerKernel.h | 35
-rw-r--r-- src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp | 27
-rw-r--r-- src/core/CL/kernels/CLSpaceToDepthLayerKernel.h | 4
-rw-r--r-- src/core/CL/kernels/CLStackLayerKernel.cpp | 41
-rw-r--r-- src/core/CL/kernels/CLStackLayerKernel.h | 17
-rw-r--r-- src/core/CL/kernels/CLStridedSliceKernel.cpp | 130
-rw-r--r-- src/core/CL/kernels/CLStridedSliceKernel.h | 29
-rw-r--r-- src/core/CL/kernels/CLTileKernel.cpp | 47
-rw-r--r-- src/core/CL/kernels/CLTileKernel.h | 5
-rw-r--r-- src/core/CL/kernels/CLTransposeKernel.cpp | 116
-rw-r--r-- src/core/CL/kernels/CLTransposeKernel.h | 64
-rw-r--r-- src/core/CL/kernels/CLWeightsReshapeKernel.cpp | 168
-rw-r--r-- src/core/CL/kernels/CLWeightsReshapeKernel.h | 121
-rw-r--r-- src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp | 157
-rw-r--r-- src/core/CL/kernels/CLWinogradFilterTransformKernel.h | 115
-rw-r--r-- src/core/CL/kernels/CLWinogradInputTransformKernel.cpp | 256
-rw-r--r-- src/core/CL/kernels/CLWinogradInputTransformKernel.h | 121
-rw-r--r-- src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp | 265
-rw-r--r-- src/core/CL/kernels/CLWinogradOutputTransformKernel.h | 127
-rw-r--r-- src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h | 105
151 files changed, 2804 insertions, 16250 deletions
diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
index 909972482f..5b72354abe 100644
--- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,37 +29,36 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *prev_output, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Only ARG_IDX_MAX and ARG_IDX_MIN are supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::S64);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN,
+ "Only ARG_IDX_MAX and ARG_IDX_MIN are supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32);
- }
- if(prev_output != nullptr && prev_output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(prev_output, 1, DataType::U32, DataType::S32);
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(prev_output, output);
- }
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32, DataType::S64,
+ DataType::U64);
}
return Status{};
@@ -67,59 +66,70 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *prev_outp
} // namespace
CLArgMinMaxLayerKernel::CLArgMinMaxLayerKernel()
- : _input(nullptr), _prev_output(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::ARG_IDX_MAX)
+ : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::ARG_IDX_MAX)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLArgMinMaxLayerKernel::configure(const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLArgMinMaxLayerKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, prev_output, output, axis, op);
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op);
}
-void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape{ input->info()->tensor_shape() };
+ TensorShape output_shape{input->info()->tensor_shape()};
output_shape.set(axis, 1);
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(DataType::S32).reset_padding().set_is_resizable(true));
+ auto_init_if_empty(*output->info(), input->info()
+ ->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_type(DataType::S32)
+ .reset_padding()
+ .set_is_resizable(true));
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, output->info(), axis, op));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
- auto padding_info = get_padding_info({ input, prev_output, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
- _prev_output = prev_output;
_output = output;
_reduction_axis = axis;
_op = op;
// Set build options
- const auto vector_size = (axis == 0) ? 16U : adjust_vec_size(16U, input->info()->dimension(0));
+ const auto adjusted_vector_size = adjust_vec_size(16U, input->info()->dimension(0));
+ const auto vector_size = (adjusted_vector_size == 3U && axis == 0U)
+ ? 2U
+ : adjusted_vector_size; // the opencl kernel only supports sizes 2, 4, 8 and 16.
CLBuildOptions build_opts;
- build_opts.add_option_if(_prev_output != nullptr, "-DPREV_OUTPUT");
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % vector_size));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(input->info()->dimension(0) % vector_size));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vector_size));
build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE");
build_opts.add_option_if_else(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX", "-DARG_MIN");
build_opts.add_option("-DDATA_TYPE_OUTPUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+ build_opts.add_option("-DCOND_DATA_TYPE=" + get_cl_select_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DUNROLL_WITH_PRAGMA=1");
// Create kernel
- cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange();
std::string kernel_axis_name;
- switch(axis)
+ switch (axis)
{
case 0:
- {
- const ICLTensor *input_for_width = prev_output != nullptr ? _prev_output : _input;
- build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input_for_width->info()->dimension(0)));
-
+ build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
kernel_axis_name = "x";
- lws_hint = create_lws_hint_parallel_implementations(input_for_width->info()->dimension(0), vector_size);
- }
- break;
+ break;
case 1:
build_opts.add_option("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
kernel_axis_name = "y";
@@ -139,15 +149,18 @@ void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context,
_kernel = create_kernel(compile_context, "arg_min_max_" + kernel_axis_name, build_opts.options());
// Configure kernel window
- Window win = calculate_max_window((prev_output != nullptr) ? (*prev_output->info()) : (*input->info()), Steps(vector_size));
- ICLKernel::configure_internal(win, lws_hint);
+ Window win = calculate_max_window(*input->info(), Steps(vector_size));
+ ICLKernel::configure_internal(win);
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLArgMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *prev_output, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status CLArgMinMaxLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ unsigned int axis,
+ ReductionOperation op)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, prev_output, output, axis, op));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
return Status{};
}
@@ -156,43 +169,36 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- switch(_reduction_axis)
+ switch (_reduction_axis)
{
case 0:
{
// Set out window
Window out_window(window);
+ Window in_window(window);
out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+ in_window.set(Window::DimX,
+ Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+ in_window.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), 1u));
// Get first input and output slices
- Window in_slice = window.first_slice_window_2D();
+ Window in_slice = in_window.first_slice_window_2D();
Window out_slice = out_window.first_slice_window_2D();
-
- // Reshape window
- const unsigned int num_tensors = _prev_output != nullptr ? 3 : 2;
-
- // Set local sums buffer
- unsigned int local_res_size = lws_hint()[0] * _output->info()->element_size();
- _kernel.setArg(num_arguments_per_2D_tensor() * num_tensors, local_res_size, nullptr);
do
{
unsigned int idx = 0;
add_2D_tensor_argument(idx, _input, in_slice);
- if(_prev_output != nullptr)
- {
- add_2D_tensor_argument(idx, _prev_output, in_slice);
- }
add_2D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+ } while (in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
}
break;
case 1:
{
// Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
+ Window window_in{window};
+ window_in.set(Window::DimY,
+ Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
Window in_slice = window_in.first_slice_window_2D();
Window out_slice = window.first_slice_window_2D();
@@ -202,15 +208,15 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
add_2D_tensor_argument(idx, _input, in_slice);
add_2D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+ } while (window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
}
break;
case 2:
{
// Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
+ Window window_in{window};
+ window_in.set(Window::DimZ,
+ Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
Window in_slice = window_in.first_slice_window_3D();
Window out_slice = window.first_slice_window_3D();
@@ -220,14 +226,13 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
add_3D_tensor_argument(idx, _input, in_slice);
add_3D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
+ } while (window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
}
break;
case 3:
{
// Get first input and output slices
- Window window_in{ window };
+ Window window_in{window};
window_in.set(3, Window::Dimension(0, 1, 1));
Window in_slice = window_in.first_slice_window_4D();
Window out_slice = window.first_slice_window_4D();
@@ -238,8 +243,7 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
add_4D_tensor_argument(idx, _input, in_slice);
add_4D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
+ } while (window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
}
break;
default:
diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.h b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h
index 929677f905..fb3b41b0de 100644
--- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.h
+++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -56,48 +57,46 @@ public:
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
- * @param[in] prev_output Destination tensor of the previous iterations of @ref CLArgMinMaxLayerKernel. Data types supported: U32/S32
- * Has to be nullptr for the first iteration
- * @param[out] output Destination tensor. Data types supported: U32/S32
- * Output will have the same number of dimensions as input.
- * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
- * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported.
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
+ * @param[out] output Destination tensor. Data types supported: U32/S32
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
+ * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported.
*/
- void configure(const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, unsigned int axis, ReductionOperation op);
+ void configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
* @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
- * @param[in] prev_output Destination tensor of the previous iterations of @ref CLArgMinMaxLayerKernel. Data types supported: U32/S32
- * Has to be nullptr for the first iteration
* @param[out] output Destination tensor. Data types supported: U32/S32
* Output will have the same number of dimensions as input.
* @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
* @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, unsigned int axis, ReductionOperation op);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op);
/** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayerKernel.
*
- * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
- * @param[in] prev_output Destination tensor info of the previous iterations. Data types supported: U32/S32
- * Has to be nullptr for the first iteration
- * @param[in] output Destination tensor info. Data types supported: U32/S32
- * Output will have the same number of dimensions as input.
- * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
- * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported.
+ * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
+ * @param[in] output Destination tensor info. Data types supported: U32/S32
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
+ * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *prev_output, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
private:
const ICLTensor *_input;
- const ICLTensor *_prev_output;
ICLTensor *_output;
unsigned int _reduction_axis;
ReductionOperation _op;
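For reference, a minimal usage sketch of the simplified single-pass interface above, now that the prev_output chaining is gone. This is not part of the commit; it assumes the internal kernel header is reachable on the include path, that the kernel is driven through CLScheduler, and the tensor shapes and function name are illustrative.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h"

using namespace arm_compute;

void argmax_axis0_sketch()
{
    CLScheduler::get().default_init(); // create a default CL context and queue

    CLTensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::F32));
    // The reduced axis collapses to 1; S32 holds the resulting indices.
    output.allocator()->init(TensorInfo(TensorShape(1U, 16U), 1, DataType::S32));

    // Single-pass validate/configure: no prev_output argument any more.
    ARM_COMPUTE_ERROR_THROW_ON(CLArgMinMaxLayerKernel::validate(
        input.info(), output.info(), /* axis */ 0, ReductionOperation::ARG_IDX_MAX));

    CLArgMinMaxLayerKernel kernel;
    kernel.configure(&input, &output, /* axis */ 0, ReductionOperation::ARG_IDX_MAX);

    input.allocator()->allocate();
    output.allocator()->allocate();
    CLScheduler::get().enqueue(kernel); // runs over the window configured above
}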
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index 44bdc6f587..c88a852a44 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,49 +29,58 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
using namespace arm_compute;
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta, const ITensorInfo *gamma,
- float epsilon, ActivationLayerInfo act_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta,
+ const ITensorInfo *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_UNUSED(epsilon);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
- if(beta != nullptr)
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(
+ input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
+ if (beta != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
}
- if(gamma != nullptr)
+ if (gamma != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
}
- if(act_info.enabled())
+ if (act_info.enabled())
{
ActivationLayerInfo::ActivationFunction act = act_info.activation();
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32 && input->data_type() != DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU
- && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
- && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+ ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU &&
+ act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU &&
+ act !=
+ ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a());
}
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -83,14 +92,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *output)
{
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->element_size(), input->dimension(0));
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(16 / input->element_size(), input->dimension(0));
// Configure kernel window
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
bool window_changed = false;
- if(output != nullptr)
+ if (output != nullptr)
{
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
window_changed = update_window_and_padding(win, input_access, output_access);
@@ -101,29 +111,50 @@ std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input,
window_changed = update_window_and_padding(win, input_access);
}
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel()
- : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0), _run_in_place(false)
+ : _input(nullptr),
+ _output(nullptr),
+ _mean(nullptr),
+ _var(nullptr),
+ _beta(nullptr),
+ _gamma(nullptr),
+ _epsilon(0),
+ _run_in_place(false)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma,
- float epsilon, ActivationLayerInfo act_info)
+void CLBatchNormalizationLayerKernel::configure(ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta,
+ const ICLTensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info);
}
-void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta,
- const ICLTensor *gamma,
- float epsilon, ActivationLayerInfo act_info)
+void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta,
+ const ICLTensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var);
- auto padding_info = get_padding_info({ input, output, mean, var, beta, gamma });
+ auto padding_info = get_padding_info({input, output, mean, var, beta, gamma});
_input = input;
_output = output;
_mean = mean;
@@ -138,13 +169,15 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_
mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr,
(gamma != nullptr) ? gamma->info() : nullptr, epsilon, act_info));
- unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
+ unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
// Set build options
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
@@ -153,29 +186,33 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_
build_opts.add_option_if(gamma == nullptr, "-DUSE_DEFAULT_GAMMA");
// Create kernel
- _kernel = create_kernel(compile_context, "batchnormalization_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel =
+ create_kernel(compile_context,
+ "batchnormalization_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Set kernel static arguments
unsigned int include_output = (!_run_in_place) ? 1 : 0;
- unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor() + 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
- if(_beta != nullptr)
+ unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor() +
+ 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
+ if (_beta != nullptr)
{
idx += num_arguments_per_1D_tensor(); // Skip beta parameter
}
- if(_gamma != nullptr)
+ if (_gamma != nullptr)
{
idx += num_arguments_per_1D_tensor(); // Skip gamma parameter
}
_kernel.setArg<cl_float>(idx++, _epsilon);
- if(output != nullptr)
+ if (output != nullptr)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output->info(), *input->info()->clone());
}
// Configure kernel window
- if(input->info()->data_layout() == DataLayout::NHWC)
+ if (input->info()->data_layout() == DataLayout::NHWC)
{
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
ICLKernel::configure_internal(win);
@@ -201,18 +238,23 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_
_config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
}
-Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta, const ITensorInfo *gamma,
- float epsilon, ActivationLayerInfo act_info)
+Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta,
+ const ITensorInfo *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
const bool run_in_place = (output == nullptr) || (output == input);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
- if(input->data_layout() != DataLayout::NHWC)
+ if (input->data_layout() != DataLayout::NHWC)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? nullptr : output->clone().get())
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? nullptr : output->clone().get())
+ .first);
}
return Status{};
@@ -232,11 +274,11 @@ void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue
unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor();
add_1D_tensor_argument(idx, _mean, vector_slice);
add_1D_tensor_argument(idx, _var, vector_slice);
- if(_beta != nullptr)
+ if (_beta != nullptr)
{
add_1D_tensor_argument(idx, _beta, vector_slice);
}
- if(_gamma != nullptr)
+ if (_gamma != nullptr)
{
add_1D_tensor_argument(idx, _gamma, vector_slice);
}
@@ -245,11 +287,10 @@ void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue
{
idx = 0;
add_3D_tensor_argument(idx, _input, slice);
- if(!_run_in_place)
+ if (!_run_in_place)
{
add_3D_tensor_argument(idx, _output, slice);
}
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h
index 743f4a9594..1a88d2a8c5 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#ifndef ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H
#define ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -63,7 +65,13 @@ public:
* @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
*/
- void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr, const ICLTensor *gamma = nullptr, float epsilon = 0.001f,
+ void configure(ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta = nullptr,
+ const ICLTensor *gamma = nullptr,
+ float epsilon = 0.001f,
ActivationLayerInfo act_info = ActivationLayerInfo());
/** Set the input and output tensors.
*
@@ -81,8 +89,15 @@ public:
* @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr,
- const ICLTensor *gamma = nullptr, float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta = nullptr,
+ const ICLTensor *gamma = nullptr,
+ float epsilon = 0.001f,
+ ActivationLayerInfo act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLBatchNormalizationLayerKernel
*
* @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result.
@@ -98,10 +113,14 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
- float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta = nullptr,
+ const ITensorInfo *gamma = nullptr,
+ float epsilon = 0.001f,
+ ActivationLayerInfo act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
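For context, a sketch of how the reformatted configure() above is typically driven, with beta and gamma left at their nullptr defaults so the kernel takes its -DUSE_DEFAULT_BETA/-DUSE_DEFAULT_GAMMA paths. This is not part of the commit; shapes, names, and the NHWC layout choice are illustrative.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h"

using namespace arm_compute;

void batchnorm_sketch()
{
    CLScheduler::get().default_init();

    const unsigned int channels = 8;
    TensorInfo src_info(TensorShape(channels, 20U, 20U), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NHWC); // dimension 0 is the channel axis

    CLTensor src, dst, mean, var;
    src.allocator()->init(src_info);
    dst.allocator()->init(src_info);
    mean.allocator()->init(TensorInfo(TensorShape(channels), 1, DataType::F32));
    var.allocator()->init(TensorInfo(TensorShape(channels), 1, DataType::F32));

    // Fused activation: validate() only accepts RELU/BOUNDED_RELU/LU_BOUNDED_RELU.
    CLBatchNormalizationLayerKernel bn;
    bn.configure(&src, &dst, &mean, &var, /* beta */ nullptr, /* gamma */ nullptr, 0.001f,
                 ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f));

    for (CLTensor *t : {&src, &dst, &mean, &var})
    {
        t->allocator()->allocate();
    }
    CLScheduler::get().enqueue(bn);
}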
diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
index da41feb7b8..c640b5a8d6 100644
--- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,10 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -44,7 +47,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -52,7 +55,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
return Status{};
}
-Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const ITensorInfo *output)
+Status validate_arguments_static(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
@@ -64,14 +71,12 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape_x * input->tensor_shape()[idx_width]));
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape_y * input->tensor_shape()[idx_height]));
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] != input->tensor_shape()[idx_channel]);
+ const TensorShape expected_output_shape = compute_batch_to_space_shape(
+ input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info);
+ const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output);
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -80,9 +85,9 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
}
} // namespace
-CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel()
- : _input(nullptr), _block_shape(nullptr), _output(nullptr)
+CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel() : _input(nullptr), _block_shape(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
@@ -90,11 +95,14 @@ void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const ICLTenso
configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output);
}
-void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
+void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- auto padding_info = get_padding_info({ input, block_shape, output });
+ auto padding_info = get_padding_info({input, block_shape, output});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), output->info()));
@@ -102,66 +110,83 @@ void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_contex
_block_shape = block_shape;
_output = output;
- const int idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
-
// Create kernel
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(input->info()->dimension(3)));
- build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
- _kernel = create_kernel(compile_context, "batch_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
+ build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(3)));
+ _kernel = create_kernel(compile_context,
+ "batch_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
+ Window win = calculate_max_window(*output->info(), Steps());
ICLKernel::configure_internal(win);
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output)
+void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output);
+ configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output, crop_info);
}
-void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output)
+void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = compute_batch_to_space_shape(input->info(), block_shape_x, block_shape_y);
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+ const TensorShape output_shape = compute_batch_to_space_shape(
+ input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y);
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info));
_input = input;
_output = output;
- const int idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
-
// Create kernel
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
- build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(input->info()->dimension(3)));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+ build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(3)));
build_opts.add_option("-DBLOCK_SHAPE_X=" + support::cpp11::to_string(block_shape_x));
build_opts.add_option("-DBLOCK_SHAPE_Y=" + support::cpp11::to_string(block_shape_y));
- build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
- _kernel = create_kernel(compile_context, "batch_to_space_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ build_opts.add_option("-DCROP_LEFT=" + support::cpp11::to_string(crop_info.left));
+ build_opts.add_option("-DCROP_TOP=" + support::cpp11::to_string(crop_info.top));
+ _kernel = create_kernel(
+ compile_context, "batch_to_space_static_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
+ Window win = calculate_max_window(*output->info(), Steps());
ICLKernel::configure_internal(win);
}
-Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+Status
+CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_shape, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, output));
return Status{};
}
-Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output)
+Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output, crop_info));
return Status{};
}
@@ -170,32 +195,31 @@ void CLBatchToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- Window slice_in = window.first_slice_window_3D();
- Window slice_out = window.first_slice_window_4D();
+ Window slice_out = window.first_slice_window_3D();
+ Window slice_in = window.first_slice_window_4D();
Window vector_slice = window.first_slice_window_1D();
vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
- slice_out.set(3, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
int batch_id = 0;
do
{
unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _input, slice_in);
add_argument(idx, batch_id);
- if(_block_shape != nullptr)
+ if (_block_shape != nullptr)
{
add_1D_tensor_argument(idx, _block_shape, vector_slice);
}
- add_4D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_in, lws_hint());
+ add_3D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out, lws_hint());
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_in));
+ } while (window.slide_window_slice_3D(slice_out));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h
index 131a43e59c..b9d3e66fe2 100644
--- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h
+++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLBATCHTOSPACELAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -52,6 +53,8 @@ public:
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
* @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
* @param[out] output Tensor output. Data types supported: same as @p input
+ *
+ * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release
*/
void configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output);
/** Initialise the kernel's inputs and output.
@@ -60,16 +63,26 @@ public:
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
* @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
* @param[out] output Tensor output. Data types supported: same as @p input
+ *
+ * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ ICLTensor *output);
/** Initialise the kernel's inputs and output (Static block shape).
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
* @param[in] block_shape_x Block shape x value.
* @param[in] block_shape_y Block shape y value.
* @param[out] output Tensor output. Data types supported: same as @p input
+ * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed
*/
- void configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output);
+ void configure(const ICLTensor *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info);
/** Initialise the kernel's inputs and output (Static block shape).
*
* @param[in] compile_context The compile context to be used.
@@ -77,8 +90,14 @@ public:
* @param[in] block_shape_x Block shape x value.
* @param[in] block_shape_y Block shape y value.
* @param[out] output Tensor output. Data types supported: same as @p input
+ * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayerKernel
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -86,6 +105,8 @@ public:
* @param[in] output Tensor output. Data types supported: same as @p input
*
* @return a status
+ *
+ * @deprecated This method for dynamic block shape is not fully mature and will be removed in the 23.08 release
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayerKernel (Static block shape).
@@ -94,10 +115,15 @@ public:
* @param[in] block_shape_x Block shape x value.
* @param[in] block_shape_y Block shape y value.
* @param[in] output Tensor output. Data types supported: same as @p input
+ * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
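Taken together, the new overloads make CropInfo mandatory on the static-block-shape path. A minimal call-site sketch, assuming input and output are pre-allocated ICLTensor pointers and that a default-constructed CropInfo means "no cropping" (both are assumptions, not stated in this patch):

    CLBatchToSpaceLayerKernel kernel;
    const int32_t  block_shape_x = 2;
    const int32_t  block_shape_y = 2;
    const CropInfo crop_info{}; // assumed: default-constructed == no crop
    kernel.configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output,
                     crop_info);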
diff --git a/src/core/CL/kernels/CLBitwiseKernel.cpp b/src/core/CL/kernels/CLBitwiseKernel.cpp
index b1f7c00fac..de3fb43de8 100644
--- a/src/core/CL/kernels/CLBitwiseKernel.cpp
+++ b/src/core/CL/kernels/CLBitwiseKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,23 +27,30 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
{
-CLBitwiseKernel::CLBitwiseKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
+CLBitwiseKernel::CLBitwiseKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, BitwiseOperation op)
+void CLBitwiseKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ BitwiseOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
- if(op != BitwiseOperation::NOT)
+ if (op != BitwiseOperation::NOT)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input2);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
@@ -53,7 +60,7 @@ void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const I
    // Output auto initialization if not yet initialized
auto_init_if_empty(*(output->info()), *(input1->info()));
- auto padding_info = get_padding_info({ input1, input2, output });
+ auto padding_info = get_padding_info({input1, input2, output});
// Configure kernel window
const unsigned int vec_size_x = adjust_vec_size(16 / output->info()->element_size(), output->info()->dimension(0));
@@ -65,7 +72,7 @@ void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const I
// Create kernel
std::string kernel_name = "";
- switch(op)
+ switch (op)
{
case BitwiseOperation::AND:
kernel_name = "bitwise_and";
@@ -104,13 +111,12 @@ void CLBitwiseKernel::run(const Window &window, cl::CommandQueue &queue)
{
unsigned int idx = 0;
add_2D_tensor_argument(idx, _input1, slice);
- if(_input2 != nullptr)
+ if (_input2 != nullptr)
{
add_2D_tensor_argument(idx, _input2, slice);
}
add_2D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
+ } while (window.slide_window_slice_2D(slice));
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
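The configure() above now sizes its window with adjust_vec_size(16 / element_size, dim0) rather than a fixed 16-byte stride. A rough model of what that helper computes (a sketch of the intent; the library's real implementation lives in AdjustVecSize.h and may differ in detail):

    // Shrink the requested lane count until whole vectors fit inside the
    // innermost extent, so no vector access runs past dimension 0.
    unsigned int adjust_vec_size_sketch(unsigned int vec_size, unsigned int dim0)
    {
        while (vec_size > dim0)
        {
            vec_size /= 2; // assumed: round down through power-of-two widths
        }
        return vec_size;
    }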
diff --git a/src/core/CL/kernels/CLBitwiseKernel.h b/src/core/CL/kernels/CLBitwiseKernel.h
index c5a999643d..2c74955ae4 100644
--- a/src/core/CL/kernels/CLBitwiseKernel.h
+++ b/src/core/CL/kernels/CLBitwiseKernel.h
@@ -59,7 +59,11 @@ public:
* @param[out] output Destination tensor. Data types supported: U8.
* @param[in] op Bitwise operation to perform. Supported: AND, OR, NOT, XOR.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, BitwiseOperation op);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ BitwiseOperation op);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
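One behavioural detail from the .cpp above is worth noting: input2 is only null-checked when the operation is binary, so a unary NOT can pass nullptr for it. A usage sketch, assuming src and dst are valid ICLTensor pointers:

    CLBitwiseKernel not_kernel;
    // Unary NOT: the second operand is never read, so nullptr is accepted here.
    not_kernel.configure(CLKernelLibrary::get().get_compile_context(), src, /*input2=*/nullptr, dst,
                         BitwiseOperation::NOT);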
diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
index 2c12275e57..f32c518e29 100644
--- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
+++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,13 +25,13 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLArray.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "src/core/AccessWindowStatic.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -41,7 +41,10 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status validate_arguments(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(boxes);
@@ -54,7 +57,7 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
ARM_COMPUTE_RETURN_ERROR_ON(boxes->num_dimensions() > 2);
const bool is_qasymm16 = boxes->data_type() == DataType::QASYMM16;
- if(is_qasymm16)
+ if (is_qasymm16)
{
const UniformQuantizationInfo boxes_qinfo = boxes->quantization_info().uniform();
ARM_COMPUTE_RETURN_ERROR_ON(boxes_qinfo.scale != 0.125f);
@@ -66,12 +69,12 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes, deltas);
}
- if(pred_boxes->total_size() > 0)
+ if (pred_boxes->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(pred_boxes->tensor_shape(), deltas->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(pred_boxes, boxes);
ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes->num_dimensions() > 2);
- if(is_qasymm16)
+ if (is_qasymm16)
{
const UniformQuantizationInfo pred_boxes_qinfo = pred_boxes->quantization_info().uniform();
ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes_qinfo.scale != 0.125f);
@@ -84,21 +87,31 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
}
} // namespace
-CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel()
- : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr)
+CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel() : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLBoundingBoxTransformKernel::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+void CLBoundingBoxTransformKernel::configure(const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info)
{
configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info);
}
-void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
- auto padding_info = get_padding_info({ boxes, pred_boxes, deltas });
- auto_init_if_empty(*pred_boxes->info(), deltas->info()->clone()->set_data_type(boxes->info()->data_type()).set_quantization_info(boxes->info()->quantization_info()));
+ auto padding_info = get_padding_info({boxes, pred_boxes, deltas});
+ auto_init_if_empty(*pred_boxes->info(), deltas->info()
+ ->clone()
+ ->set_data_type(boxes->info()->data_type())
+ .set_quantization_info(boxes->info()->quantization_info()));
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(boxes->info(), pred_boxes->info(), deltas->info(), info));
@@ -128,7 +141,7 @@ void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_con
build_opts.add_option_if(info.apply_scale(), "-DSCALE_AFTER=" + float_to_string_with_full_precision(info.scale()));
build_opts.add_option_if(info.correct_transform_coords(), "-DOFFSET=1");
- if(is_quantized)
+ if (is_quantized)
{
build_opts.add_option("-DDATA_TYPE_DELTAS=" + get_cl_type_from_data_type(deltas->info()->data_type()));
const UniformQuantizationInfo boxes_qinfo = boxes->info()->quantization_info().uniform();
@@ -148,12 +161,15 @@ void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_con
// Since the number of columns is a multiple of 4 by definition, we don't need to pad the tensor
const unsigned int num_elems_processed_per_iteration = 4;
- Window win = calculate_max_window(*deltas->info(), Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*deltas->info(), Steps(num_elems_processed_per_iteration));
ICLKernel::configure_internal(win);
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status CLBoundingBoxTransformKernel::validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(boxes, pred_boxes, deltas, info));
return Status{};
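validate_arguments() pins QASYMM16 box tensors to a scale of exactly 0.125 with zero offset. A tensor-info sketch that would satisfy those checks (the shape, 4 coordinates by 128 boxes, is a placeholder):

    TensorInfo boxes_info(TensorShape(4U, 128U), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0));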
diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.h b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h
index 08f350e86a..9a1bb49bb9 100644
--- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.h
+++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h
@@ -58,7 +58,10 @@ public:
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
*
*/
- void configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info);
+ void configure(const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -71,7 +74,11 @@ public:
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
*
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform
*
@@ -85,7 +92,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
+ static Status validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
index c969792c3e..ec58bf9e7a 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,6 +28,10 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -43,15 +47,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with less than 2 groups would be inefficient");
- const unsigned int channels = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
+ const unsigned int channels =
+ input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups == channels, "Channel shuffling with same number of groups as number of channels would be inefficient");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ num_groups == channels,
+ "Channel shuffling with same number of groups as number of channels would be inefficient");
// There cannot be more groups than channels
ARM_COMPUTE_RETURN_ERROR_ON(num_groups > channels);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, "The number of channels must be a multiple of the number of groups");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0,
+ "The number of channels must be a multiple of the number of groups");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -66,28 +74,42 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output, *input->clone());
- const bool is_nhwc = input->data_layout() == DataLayout::NHWC;
- const unsigned int num_elems_processed_per_iteration_x = is_nhwc ? 4 : max_cl_vector_width / input->element_size();
- constexpr unsigned int num_elems_processed_per_iteration_y = 2;
+ const bool is_nhwc = input->data_layout() == DataLayout::NHWC;
+ if (is_nhwc)
+ {
+ unsigned int num_elems_processed_per_iteration_x =
+ adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x));
+ Window win_collapsed = win.collapse(win, Window::DimZ);
+ return std::make_pair(Status{}, win_collapsed);
+ }
+ else
+ {
+ const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / input->element_size();
+ constexpr unsigned int num_elems_processed_per_iteration_y = 2;
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
- AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ // Configure kernel window
+ Window win = calculate_max_window(
+ *input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x,
+ num_elems_processed_per_iteration_y);
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x,
+ num_elems_processed_per_iteration_y);
- const bool window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, input->valid_region());
+ const bool window_changed = update_window_and_padding(win, input_access, output_access);
- Window win_collapsed = win.collapse(win, Window::DimZ);
+ Window win_collapsed = win.collapse(win, Window::DimZ);
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win_collapsed);
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win_collapsed);
+ }
}
} // namespace
-CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel()
- : _input(nullptr), _output(nullptr)
+CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel() : _input(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
@@ -95,27 +117,42 @@ void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *o
configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups);
}
-void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
+void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups));
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups));
-
const DataLayout data_layout = input->info()->data_layout();
const bool is_nhwc = data_layout == DataLayout::NHWC;
- const unsigned int channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
- const unsigned int vec_size = is_nhwc ? 4 : max_cl_vector_width / input->info()->element_size();
+ const unsigned int channels =
+ input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
+ unsigned int vec_size_x = 0;
+ unsigned int vec_size_x_leftovers = 0;
+ if (is_nhwc)
+ {
+ vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
+ vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
+ }
+ else
+ {
+ vec_size_x = max_cl_vector_width / input->info()->element_size();
+ }
// Set kernel build options
CLBuildOptions build_opts;
build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
build_opts.add_option("-DK=" + support::cpp11::to_string(channels / num_groups));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option_if(is_nhwc, "-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftovers));
+ build_opts.add_option_if(is_nhwc, "-DSRC_DIM_X=" + support::cpp11::to_string(input->info()->dimension(0)));
build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
- build_opts.add_option("-DLAST_ACCESSED=" + support::cpp11::to_string(std::max(static_cast<int>(channels - vec_size), 0)));
build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
// Create kernel
@@ -146,9 +183,14 @@ void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_cont
_config_id += support::cpp11::to_string(output->info()->dimension(1));
_config_id += "_";
_config_id += support::cpp11::to_string(output->info()->dimension(2));
+ if (data_layout == DataLayout::NHWC)
+ {
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+ }
}
-Status CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+Status
+CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, num_groups));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
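In the NHWC branch the kernel now emits both VEC_SIZE and VEC_SIZE_LEFTOVER, replacing the old LAST_ACCESSED clamp. A worked sketch of the two values for an F32 tensor, assuming a 16-byte max CL vector width and the halving model sketched earlier:

    unsigned int vec_size_x  = 16 / 4; // 4 lanes of 4-byte F32
    const unsigned int dim_x = 10;     // placeholder innermost extent
    while (vec_size_x > dim_x)
    {
        vec_size_x /= 2; // rough adjust_vec_size model; 4 <= 10, so 4 is kept
    }
    const unsigned int leftovers = dim_x % vec_size_x; // 2 elements for the tail path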
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.h b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h
index 31c007f17e..43c939ebd8 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.h
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h
@@ -60,7 +60,10 @@ public:
* @param[out] output Output tensor. Data type supported: Same as @p input
* @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int num_groups);
/** Static function to check if given info will lead to a valid configuration of @ref CLChannelShuffleLayerKernel
*
* @param[in] input Input tensor info. Data types supported: All.
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
deleted file mode 100644
index 44b8471725..0000000000
--- a/src/core/CL/kernels/CLCol2ImKernel.cpp
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLCol2ImKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <cmath>
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_col2im_shape(*input, convolved_dims, true, num_groups));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_layout() != DataLayout::NCHW, "Col2Im output's data layout must always be NCHW");
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_col2im_shape(*input, convolved_dims, true, num_groups)).set_data_layout(DataLayout::NCHW));
-
- constexpr unsigned int num_elems_read_per_iteration = 8;
-
- // Configure window
- Window win = calculate_max_window(*input, Steps(num_elems_read_per_iteration));
-
- // Update window and padding just for the input tensor as we cannot access out-of-bounds elements in the output one
- AccessWindowHorizontal input_access(input, 0, num_elems_read_per_iteration);
- bool window_changed = update_window_and_padding(win, input_access);
-
- Coordinates coord;
- coord.set_num_dimensions(output->num_dimensions());
- output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLCol2ImKernel::CLCol2ImKernel()
- : _input(nullptr), _output(nullptr), _convolved_dims()
-{
-}
-
-void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, convolved_dims, num_groups);
-}
-
-void CLCol2ImKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), convolved_dims, num_groups));
-
- _input = input;
- _output = output;
- _convolved_dims = convolved_dims;
-
- const DataType data_type = input->info()->data_type();
-
- // Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
- build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size()));
- build_opts.add_option("-DWIDTH_INPUT=" + support::cpp11::to_string(input->info()->dimension(0)));
- build_opts.add_option("-DWIDTH_OUTPUT=" + support::cpp11::to_string(_convolved_dims.width));
- build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
-
- _kernel = create_kernel(compile_context, "col2im", build_opts.options());
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), _convolved_dims, num_groups);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Set config_id for enabling LWS tuning
- _config_id = "col2im_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(num_groups);
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-Status CLCol2ImKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, convolved_dims, num_groups));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), convolved_dims, num_groups).first);
- return Status{};
-}
-
-void CLCol2ImKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- bool is_collapsed = false;
- bool is_collapsed_out = false;
-
- Window out_window;
- out_window.use_tensor_dimensions(_output->info()->tensor_shape());
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &is_collapsed);
- Window collapsed_out = out_window.collapse_if_possible(out_window, 3, &is_collapsed_out);
-
- ARM_COMPUTE_ERROR_ON(is_collapsed != is_collapsed_out);
-
- Window slice = collapsed.first_slice_window_3D();
- Window slice_out = collapsed_out.first_slice_window_4D();
- do
- {
- // Set inputs
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_4D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLCol2ImKernel.h b/src/core/CL/kernels/CLCol2ImKernel.h
deleted file mode 100644
index 710e048bca..0000000000
--- a/src/core/CL/kernels/CLCol2ImKernel.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCOL2IMKERNEL_H
-#define ARM_COMPUTE_CLCOL2IMKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the col2im reshaping kernel.
- *
- * Rearranges each matrix column into image blocks. It's the inverse operation of @ref CLIm2ColKernel.
- *
- * For example, a vector of 9 elements can be reshaped to a block(image) of 3x3:
- *
- * @f[
- * \left( \begin{array}{ccccccccc}
- * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccc}
- * a0 & a1 & a2 \\
- * a3 & a4 & a5 \\
- * a6 & a7 & a8 \\
- * \end{array} \right)
- * @f]
- */
-class CLCol2ImKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLCol2ImKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLCol2ImKernel(const CLCol2ImKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLCol2ImKernel &operator=(const CLCol2ImKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLCol2ImKernel(CLCol2ImKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLCol2ImKernel &operator=(CLCol2ImKernel &&) = default;
- /** Default destructor */
- ~CLCol2ImKernel() = default;
- /** Set the input and output of the kernel.
- *
- * @param[in] input The input tensor to convert. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
- * while the rest represent batch of outputs. Data types supported: Same as @p input. Data layout: NCHW
- * @param[in] convolved_dims Output convolved dimensions.
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution
- */
- void configure(const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups = 1);
- /** Set the input and output of the kernel.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input The input tensor to convert. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
- * while the rest represent batch of outputs. Data types supported: Same as @p input. Data layout: NCHW
- * @param[in] convolved_dims Output convolved dimensions.
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups = 1);
- /** Static function to check if given info will lead to a valid configuration of @ref CLCol2ImKernel
- *
- * @param[in] input The input tensor to convert. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
- * while the rest represent batch of outputs. Data types supported: Same as @p input. Data layout: NCHW
- * @param[in] convolved_dims Output convolved dimensions.
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups = 1);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-public:
- const ICLTensor *_input;
- ICLTensor *_output;
- Size2D _convolved_dims;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLCOL2IMKERNEL_H */
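The deleted header documented col2im as the inverse of im2col: a length-9 column becomes a 3x3 block. A plain scalar sketch of that reshape, independent of the removed OpenCL kernel:

    #include <vector>

    // Rearrange a flat column of w*h values into an h-by-w image block, row-major.
    std::vector<std::vector<float>> col2im_sketch(const std::vector<float> &col, int w, int h)
    {
        std::vector<std::vector<float>> image(h, std::vector<float>(w));
        for (int y = 0; y < h; ++y)
        {
            for (int x = 0; x < w; ++x)
            {
                image[y][x] = col[y * w + x];
            }
        }
        return image;
    }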
diff --git a/src/core/CL/kernels/CLComparisonKernel.cpp b/src/core/CL/kernels/CLComparisonKernel.cpp
index e2aee36bd8..a0f9aca54a 100644
--- a/src/core/CL/kernels/CLComparisonKernel.cpp
+++ b/src/core/CL/kernels/CLComparisonKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,9 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -37,22 +40,16 @@ namespace arm_compute
namespace
{
// Create supported comparisons map
-const std::map<ComparisonOperation, std::string> supported_comparison_ops =
-{
- { ComparisonOperation::Equal, "EQUAL" },
- { ComparisonOperation::NotEqual, "NOTEQUAL" },
- { ComparisonOperation::Greater, "GREATER" },
- { ComparisonOperation::GreaterEqual, "GREATEREQUAL" },
- { ComparisonOperation::Less, "LESS" },
- { ComparisonOperation::LessEqual, "LESSEQUAL" },
+const std::map<ComparisonOperation, std::string> supported_comparison_ops = {
+ {ComparisonOperation::Equal, "EQUAL"}, {ComparisonOperation::NotEqual, "NOTEQUAL"},
+ {ComparisonOperation::Greater, "GREATER"}, {ComparisonOperation::GreaterEqual, "GREATEREQUAL"},
+ {ComparisonOperation::Less, "LESS"}, {ComparisonOperation::LessEqual, "LESSEQUAL"},
};
-int calculate_num_elems_processed_per_iteration(const ITensorInfo &input)
-{
- return 16 / input.element_size();
-}
-
-Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ComparisonOperation operation)
+Status validate_arguments(const ITensorInfo &input1,
+ const ITensorInfo &input2,
+ const ITensorInfo &output,
+ ComparisonOperation operation)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
ARM_COMPUTE_RETURN_ERROR_ON(input1.data_type() == DataType::UNKNOWN);
@@ -63,7 +60,7 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
// Validate in case of configured output
- if(output.total_size() > 0)
+ if (output.total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
@@ -75,45 +72,37 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
{
- const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
- const TensorShape &out_shape = broadcast_pair.first;
- const ValidRegion &valid_region = broadcast_pair.second;
-
- const unsigned int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(input1);
+ const TensorShape &out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(16 / input1.element_size(), output.dimension(0));
// Auto initialize output if not initialized
auto_init_if_empty(output, out_shape, 1, DataType::U8, QuantizationInfo());
- Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
- Window win_input1 = win.broadcast_if_dimension_le_one(input1);
- Window win_input2 = win.broadcast_if_dimension_le_one(input2);
-
- AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
+ Window win = calculate_max_window(out_shape, Steps(num_elems_processed_per_iteration));
- bool window_changed = update_window_and_padding(win_input1, input1_access)
- || update_window_and_padding(win_input2, input2_access)
- || update_window_and_padding(win, output_access);
-
- output_access.set_valid_region(win, valid_region);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
+ return std::make_pair(Status{}, win);
}
} // namespace
-CLComparisonKernel::CLComparisonKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
+CLComparisonKernel::CLComparisonKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLComparisonKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+void CLComparisonKernel::configure(const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ComparisonOperation operation)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation);
}
-void CLComparisonKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+void CLComparisonKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ComparisonOperation operation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), operation));
@@ -129,17 +118,29 @@ void CLComparisonKernel::configure(const CLCompileContext &compile_context, cons
const std::string &operation_name = supported_comparison_ops.at(operation);
std::string kernel_name = "compare_" + lower_string(operation_name);
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(16 / input1->info()->element_size(), output->info()->dimension(0));
+
// Set kernel build options
std::set<std::string> build_opts;
build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type()));
- build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(calculate_num_elems_processed_per_iteration(*input1->info())));
+ build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.emplace("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.emplace(
+ "-DVEC_SIZE_IN1=" + //
+ support::cpp11::to_string(input1->info()->dimension(0) == 1 ? 1 : num_elems_processed_per_iteration));
+ build_opts.emplace(
+ "-DVEC_SIZE_IN2=" + //
+ support::cpp11::to_string(input2->info()->dimension(0) == 1 ? 1 : num_elems_processed_per_iteration));
build_opts.emplace("-DOP=" + operation_name);
build_opts.emplace("-DOP_NAME=" + lower_string(operation_name));
- if(is_data_type_quantized(input1->info()->data_type()))
+ if (is_data_type_quantized(input1->info()->data_type()))
{
const UniformQuantizationInfo iq1_info = input1->info()->quantization_info().uniform();
const UniformQuantizationInfo iq2_info = input2->info()->quantization_info().uniform();
+ build_opts.emplace("-DIS_QUANTIZED");
build_opts.emplace("-DOFFSET_IN1=" + support::cpp11::to_string(iq1_info.offset));
build_opts.emplace("-DOFFSET_IN2=" + support::cpp11::to_string(iq2_info.offset));
build_opts.emplace("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale));
@@ -163,12 +164,16 @@ void CLComparisonKernel::configure(const CLCompileContext &compile_context, cons
_config_id += lower_string(string_from_data_layout(input1->info()->data_layout()));
}
-Status CLComparisonKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation)
+Status CLComparisonKernel::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ComparisonOperation operation)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, operation));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
return Status{};
}
@@ -184,17 +189,18 @@ void CLComparisonKernel::run(const Window &window, cl::CommandQueue &queue)
bool can_collapse = true;
const bool is_vector = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1;
- if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
{
can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
{
can_collapse = (in_shape1[d] == in_shape2[d]);
}
}
bool has_collapsed = false;
- Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
@@ -215,16 +221,7 @@ void CLComparisonKernel::run(const Window &window, cl::CommandQueue &queue)
ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
-BorderSize CLComparisonKernel::border_size() const
-{
- const int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(*_input1->info());
-
- const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
- const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
- return BorderSize{ 0, border, 0, 0 };
-}
} // namespace arm_compute
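The rewritten comparison kernel drops border_size() entirely: instead of padded replication, a broadcast input whose innermost extent is 1 gets a per-input vector size of 1 while the output keeps the full width, and a leftover count drives the tail. A worked sketch of the three build options (extents are placeholders):

    const unsigned int out_x = 20, in1_x = 20, in2_x = 1; // in2 broadcasts along x
    unsigned int vec_size = 16 / 1;                       // 16 bytes of U8
    while (vec_size > out_x)
    {
        vec_size /= 2; // rough adjust_vec_size model; 16 <= 20, so 16 is kept
    }
    const unsigned int leftover     = out_x % vec_size;            // 4 tail elements
    const unsigned int vec_size_in1 = (in1_x == 1) ? 1 : vec_size; // 16
    const unsigned int vec_size_in2 = (in2_x == 1) ? 1 : vec_size; // 1: re-read the scalar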
diff --git a/src/core/CL/kernels/CLComparisonKernel.h b/src/core/CL/kernels/CLComparisonKernel.h
index 0b94190183..2fb4ba06b6 100644
--- a/src/core/CL/kernels/CLComparisonKernel.h
+++ b/src/core/CL/kernels/CLComparisonKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,10 +21,11 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLCOMPARISONKERNEL_H
-#define ARM_COMPUTE_CLCOMPARISONKERNEL_H
+#ifndef ACL_SRC_CORE_CL_KERNELS_CLCOMPARISONKERNEL_H
+#define ACL_SRC_CORE_CL_KERNELS_CLCOMPARISONKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -64,7 +65,11 @@ public:
* @param[out] output Destination tensor. Data types supported: U8.
* @param[in] operation Comparison operation to use.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ComparisonOperation operation);
/** Static function to check if given info will lead to a valid configuration of @ref CLComparisonKernel
*
* @param[in] input1 Source tensor. Data types supported: All.
@@ -74,11 +79,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ComparisonOperation operation);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
private:
const ICLTensor *_input1; /**< Source tensor 1 */
@@ -86,4 +93,4 @@ private:
ICLTensor *_output; /**< Destination tensor */
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCOMPARISONKERNEL_H */
+#endif // ACL_SRC_CORE_CL_KERNELS_CLCOMPARISONKERNEL_H
diff --git a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp
deleted file mode 100644
index dcf4e6662e..0000000000
--- a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-CLConvertFullyConnectedWeightsKernel::CLConvertFullyConnectedWeightsKernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLConvertFullyConnectedWeightsKernel::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
- DataLayout data_layout)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, original_input_shape, data_layout);
-}
-
-void CLConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
- DataLayout data_layout)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output tensor auto initialisation if not yet initialized
- auto_init_if_empty(*output->info(), *input->info()->clone());
-
- auto padding_info = get_padding_info({ input, output });
-
- ARM_COMPUTE_ERROR_THROW_ON(CLConvertFullyConnectedWeightsKernel::validate(input->info(), output->info(), original_input_shape, data_layout));
-
- _input = input;
- _output = output;
-
- const DataLayout input_data_layout = (data_layout == DataLayout::NCHW) ? DataLayout::NHWC : DataLayout::NCHW;
-
- const int width_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::WIDTH);
- const int height_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::HEIGHT);
- const int channel_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::CHANNEL);
-
- const unsigned int num_elems_per_input_plane = original_input_shape[width_idx] * original_input_shape[height_idx];
- const unsigned int num_channels = original_input_shape[channel_idx];
-
- const unsigned int factor_1 = (data_layout == DataLayout::NCHW) ? num_elems_per_input_plane : num_channels;
- const unsigned int factor_2 = (data_layout == DataLayout::NCHW) ? num_channels : num_elems_per_input_plane;
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
- build_opts.add_option("-DFACTOR_1=" + support::cpp11::to_string(factor_1));
- build_opts.add_option("-DFACTOR_2=" + support::cpp11::to_string(factor_2));
-
- // Create kernel
- _kernel = create_kernel(compile_context, "convert_fc_weights", build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
- ICLKernel::configure_internal(win);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
- DataLayout data_layout)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() != 2);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != original_input_shape.total_size_lower(3));
- ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN);
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
-
- return Status{};
-}
-
-void CLConvertFullyConnectedWeightsKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, window);
- add_2D_tensor_argument(idx, _output, window);
- enqueue(queue, *this, window, lws_hint());
-}
-} // namespace arm_compute
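The deleted configure() reduced the 4D original input shape to two factors: for NCHW-trained weights, factor_1 is the W*H plane size and factor_2 the channel count, swapped for NHWC. One consistent reading of the row permutation those factors drive (an interpretation of the kernel's intent, not its verbatim OpenCL):

    // Map a flattened 2D-weights row index between channel-major and plane-major
    // orderings. For NCHW -> NHWC: factor_1 = W*H, factor_2 = C.
    unsigned int remap_row_sketch(unsigned int row, unsigned int factor_1, unsigned int factor_2)
    {
        const unsigned int slow = row / factor_1; // index on the slower-varying axis
        const unsigned int fast = row % factor_1; // index on the faster-varying axis
        return fast * factor_2 + slow;            // interleave the opposite way
    }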
diff --git a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h
deleted file mode 100644
index d1da793df2..0000000000
--- a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTSKERNEL_H
-#define ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTSKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface to convert the 2D Fully Connected weights from NCHW to NHWC or vice versa.
- *
- * @note This function can be applied to the 2D weights used by a Fully Connected layer if:
- * - It follows a Convolution layer
- * - The data layout used by the network does not match the one the model has been trained in.
- *
- * @note This function assumes the weights are already reshaped (transposed)
- */
-class CLConvertFullyConnectedWeightsKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLConvertFullyConnectedWeightsKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLConvertFullyConnectedWeightsKernel(const CLConvertFullyConnectedWeightsKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLConvertFullyConnectedWeightsKernel &operator=(const CLConvertFullyConnectedWeightsKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLConvertFullyConnectedWeightsKernel(CLConvertFullyConnectedWeightsKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLConvertFullyConnectedWeightsKernel &operator=(CLConvertFullyConnectedWeightsKernel &&) = default;
- /** Default destructor */
- ~CLConvertFullyConnectedWeightsKernel() = default;
- /** Set the input and output tensor.
- *
- * @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: All.
- * @param[out] output The converted weights tensor. Shape and Data Type: Same as @p input.
- * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
- * @param[in] data_layout The data layout the weights have been trained in.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
- /** Set the input and output tensor.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: All.
- * @param[out] output The converted weights tensor. Shape and Data Type: Same as @p input.
- * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
- * @param[in] data_layout The data layout the weights have been trained in.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
- /** Static function to check if given info will lead to a valid configuration of @ref CLConvertFullyConnectedWeightsKernel
- *
- * @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
- * @param[in] output The converted weights tensor info. Shape and Data Type: Same as @p input.
- * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
- * @param[in] data_layout The data layout the weights have been trained in.
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, DataLayout data_layout);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTSKERNEL_H */
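For reference, the conversion this deleted interface described is a pure row permutation of the 2D weight matrix. A hedged sketch, assuming weights laid out as [num_inputs x num_outputs] with each row indexing one flattened element of the original 4D input entering the fully connected layer (convert_fc_weights_nchw_to_nhwc and its parameters are illustrative, not library API):

#include <cstddef>
#include <vector>

std::vector<float> convert_fc_weights_nchw_to_nhwc(const std::vector<float> &src,
                                                   std::size_t num_outputs,
                                                   std::size_t W, std::size_t H, std::size_t C)
{
    std::vector<float> dst(src.size());
    for (std::size_t c = 0; c < C; ++c)
        for (std::size_t h = 0; h < H; ++h)
            for (std::size_t w = 0; w < W; ++w)
            {
                const std::size_t nchw_row = (c * H + h) * W + w; // row index under NCHW flattening
                const std::size_t nhwc_row = (h * W + w) * C + c; // same element under NHWC flattening
                for (std::size_t o = 0; o < num_outputs; ++o)
                    dst[nhwc_row * num_outputs + o] = src[nchw_row * num_outputs + o];
            }
    return dst;
}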
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
index d28cffa05f..f8ecc4c098 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,7 +27,9 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -36,9 +38,11 @@ namespace arm_compute
CLDeconvolutionLayerUpsampleKernel::CLDeconvolutionLayerUpsampleKernel()
: _input(nullptr), _output(nullptr), _info(), _data_layout(DataLayout::UNKNOWN)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
const PadStrideInfo &info)
{
ARM_COMPUTE_UNUSED(info);
@@ -58,7 +62,7 @@ Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, co
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0);
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c));
- for(size_t i = 3; i < Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
}
@@ -66,20 +70,21 @@ Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, co
return Status{};
}
-void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output,
- const PadStrideInfo &info)
+void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
}
-void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
- const PadStrideInfo &info)
+void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PadStrideInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayerUpsampleKernel::validate(input->info(), output->info(), info));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
@@ -96,7 +101,6 @@ void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compi
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
ICLKernel::configure_internal(win);
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
@@ -118,7 +122,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
const int out_end_y = _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1;
const int out_step_y = _info.stride().second;
- switch(_data_layout)
+ switch (_data_layout)
{
case DataLayout::NCHW:
{
@@ -136,8 +140,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
add_3D_tensor_argument(idx, _input, slice_in);
add_3D_tensor_argument(idx, _output, slice_out);
enqueue(queue, *this, slice_out, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out));
+ } while (collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out));
break;
}
case DataLayout::NHWC:
@@ -155,8 +158,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
add_3D_tensor_argument(idx, _input, slice_in);
add_3D_tensor_argument(idx, _output, slice_out);
enqueue(queue, *this, slice_out, lws_hint());
- }
- while(window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
break;
}
default:
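The strided output window set up in run() above implements zero-insertion upsampling: each input sample lands at a stride-spaced output coordinate (offset by the deconvolution padding) and everything in between stays zero. A 1D sketch under those assumptions (upsample_1d is illustrative, not library API):

#include <cstddef>
#include <vector>

std::vector<float> upsample_1d(const std::vector<float> &in, std::size_t stride)
{
    // Zero-stuffed output; the subsequent convolution slides over this
    // signal like a normal convolution. The real kernel additionally
    // shifts the start position by the deconvolution padding.
    std::vector<float> out(in.size() * stride, 0.0f);
    for (std::size_t i = 0; i < in.size(); ++i)
    {
        out[i * stride] = in[i];
    }
    return out;
}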
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
index e0d1322341..762989a836 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
@@ -62,7 +62,10 @@ public:
* @param[out] output Destination tensor. Data types supported: same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
* @param[in] info Contains padding and stride information described in @ref PadStrideInfo.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PadStrideInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample
*
* @param[in] input Source tensor info. Data types supported: All.
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
index ca7e9d4b23..b33e0a8b6f 100644
--- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,9 +27,10 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -38,7 +39,11 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, input_info, weights_info);
@@ -53,19 +58,21 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_w) != deconv_info.stride().first);
ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_h) != deconv_info.stride().second);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32);
- if(!is_qasymm)
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S32);
+ if (!is_qasymm)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_info, weights_info);
}
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_info->dimension(idx_w) * weights_info->dimension(idx_h) * weights_info->dimension(idx_b));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_info->dimension(idx_w) * weights_info->dimension(idx_h) *
+ weights_info->dimension(idx_b));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != input_info->dimension(idx_w));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != input_info->dimension(idx_h));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(3) != input_info->dimension(idx_b));
- if(bias != nullptr)
+ if (bias != nullptr)
{
- if(is_qasymm)
+ if (is_qasymm)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
}
@@ -76,19 +83,26 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights_info->dimension(idx_b));
}
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second);
- auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info);
+ auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h),
+ weights_info->dimension(idx_w), weights_info->dimension(idx_h),
+ stride_info);
- const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
+ const TensorShape output_shape =
+ misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
}
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input,
+ ITensorInfo *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -97,11 +111,17 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second);
- auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info);
+ auto out_dims =
+ deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h),
+ weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info);
- const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
+ const TensorShape output_shape =
+ misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout).set_quantization_info(input->quantization_info()));
+ auto_init_if_empty(*output, input->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_layout(data_layout)
+ .set_quantization_info(input->quantization_info()));
Window win = calculate_max_window(*input);
@@ -109,28 +129,37 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
}
} // namespace
-CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel()
- : _add_bias(false),
- _bias(nullptr)
+CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel() : _add_bias(false), _bias(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+void CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
const PadStrideInfo &deconv_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, input_info, weights_info, deconv_info);
}
-void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info,
- const ITensorInfo *weights_info,
- const PadStrideInfo &deconv_info)
+void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, input_info, weights_info);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), input_info, weights_info, deconv_info));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr),
+ output->info(), input_info, weights_info, deconv_info));
- auto padding_info = get_padding_info({ input, bias, output });
+ auto padding_info = get_padding_info({input, bias, output});
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), input_info, weights_info, deconv_info);
+ auto win_config =
+ validate_and_configure_window(input->info(), output->info(), input_info, weights_info, deconv_info);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
const DataLayout data_layout = input_info->data_layout();
@@ -177,7 +206,11 @@ void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compi
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLDeconvolutionReshapeOutputKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+Status CLDeconvolutionReshapeOutputKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, input_info, weights_info, deconv_info));
@@ -193,7 +226,7 @@ void CLDeconvolutionReshapeOutputKernel::run(const Window &window, cl::CommandQu
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, collapsed);
add_3D_tensor_argument(idx, _output, collapsed);
- if(_add_bias)
+ if (_add_bias)
{
add_1D_tensor_argument(idx, _bias, collapsed);
}
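The out_dims computed in the validation path above follows the standard transposed-convolution size relation. A sketch restricted to the zero-padding case this kernel enforces (deconv_out_dims is an illustrative name, not library API):

#include <utility>

// out = (in - 1) * stride + kernel with zero padding; since this kernel
// also requires stride == kernel size, the relation collapses to
// out = in * stride.
std::pair<unsigned int, unsigned int> deconv_out_dims(unsigned int in_w, unsigned int in_h,
                                                      unsigned int k_w, unsigned int k_h,
                                                      unsigned int stride_x, unsigned int stride_y)
{
    return {(in_w - 1) * stride_x + k_w, (in_h - 1) * stride_y + k_h};
}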
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
index ce354fa86f..8f436b07e3 100644
--- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
+++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
@@ -67,7 +67,12 @@ public:
* @param[in] weights_info Deconvolution weights tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input.
* @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
*/
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info);
+ void configure(const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info);
/** Initialise the kernel's source and destination.
*
* @param[in] compile_context The compile context to be used.
@@ -79,8 +84,13 @@ public:
* @param[in] weights_info Deconvolution weights tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input.
* @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
- const PadStrideInfo &deconv_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionReshapeOutputKernel.
*
@@ -93,7 +103,12 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp b/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
deleted file mode 100644
index c98d66f390..0000000000
--- a/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <set>
-#include <string>
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
-{
- ARM_COMPUTE_UNUSED(policy);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON(input == output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input,
- 1,
- DataType::U8, DataType::S8, DataType::QSYMM8_PER_CHANNEL, DataType::S16,
- DataType::U16, DataType::U32, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output,
- 1,
- DataType::U8, DataType::S8, DataType::QASYMM8, DataType::S16,
- DataType::U16, DataType::U32, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == output->data_type(), "Input and output data types must be different");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_float(input->data_type()) && shift != 0, "Shift is used only with integer non-quantized inputs");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(input->data_type()) && shift != 0, "Shift is used only with integer non-quantized inputs");
- ARM_COMPUTE_RETURN_ERROR_ON(shift >= 8);
-
- // Validate in case of configured output
- if(output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
-
- return Status{};
-}
-} // namespace
-
-void CLDepthConvertLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, shift);
-}
-
-void CLDepthConvertLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- _input = input;
- _output = output;
-
- // Auto-initialize the output shape if not yet initialized (only the shape can be auto-configured; the data type must be given)
- set_shape_if_empty(*output->info(), input->info()->tensor_shape());
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy, shift));
-
- auto padding_info = get_padding_info({ input, output });
-
- // Get data sizes
- const size_t input_size = data_size_from_type(input->info()->data_type());
- const size_t output_size = data_size_from_type(output->info()->data_type());
-
- // Get number of elements to process per iterations
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
- build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
- // Conversions from float always SATURATE, as an out-of-bounds float->integer conversion is implementation-defined
- build_opts.add_option_if(is_data_type_float(input->info()->data_type()) || policy == ConvertPolicy::SATURATE, "-DSATURATE");
- build_opts.add_option_if(is_data_type_float(input->info()->data_type()) || is_data_type_float(output->info()->data_type()), "-DIS_DATA_TYPE_FLOAT");
- build_opts.add_option_if(is_data_type_quantized(input->info()->data_type()), "-DIS_DATA_TYPE_QUANTIZED");
-
- // Create kernel
- const std::string kernel_name = (input_size >= output_size) ? "convert_depth_down" : "convert_depth_up";
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set shift arg
- unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
- _kernel.setArg(idx++, shift);
-
- // Since the leftover vector size is calculated from the input tensor shape, the input valid
- // region must temporarily be set to the full tensor shape
- ValidRegion input_valid_region = input->info()->valid_region();
- input->info()->set_valid_region(ValidRegion(Coordinates(0, 0), input->info()->tensor_shape()));
-
- // Configure kernel
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- ICLKernel::configure_internal(win);
-
- // Collapse window
- const Window &full_window = window();
- Window collapsed_window = full_window.collapse_if_possible(full_window, Window::DimZ);
- ICLKernel::configure_internal(collapsed_window);
-
- // Restore the valid region
- input->info()->set_valid_region(input_valid_region);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-Status CLDepthConvertLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, policy, shift));
-
- return Status{};
-}
-} // namespace arm_compute
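The VEC_SIZE / VEC_SIZE_LEFTOVER build options in the deleted configure() above come from a 16-byte-per-work-item vectorization clamped to the row length. A hedged reconstruction of that arithmetic (names are illustrative; the library performs the clamp via adjust_vec_size()):

#include <algorithm>

struct VecConfig
{
    unsigned int vec_size; // elements processed per work item
    unsigned int leftover; // elements in the final partial vector
};

// Assumes dim0 > 0 and element_size_bytes in {1, 2, 4}.
VecConfig depth_convert_vec_config(unsigned int element_size_bytes, unsigned int dim0)
{
    const unsigned int preferred = 16 / element_size_bytes;   // 16-byte accesses
    const unsigned int vec_size  = std::min(preferred, dim0); // clamp to the row length
    return {vec_size, dim0 % vec_size};
}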
diff --git a/src/core/CL/kernels/CLDepthConvertLayerKernel.h b/src/core/CL/kernels/CLDepthConvertLayerKernel.h
deleted file mode 100644
index 8b511c6707..0000000000
--- a/src/core/CL/kernels/CLDepthConvertLayerKernel.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H
-#define ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLSimple3DKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the depth conversion kernel. */
-class CLDepthConvertLayerKernel : public ICLSimple3DKernel
-{
-public:
- /** Set the input and output of the kernel.
- *
- * Valid conversions Input -> Output :
- *
- * - QSYMM8_PER_CHANNEL -> QASYMM8 (ATTENTION: it is the user's responsibility to keep track of the quantization info in the TensorInfo meta-data)
- * - U8 -> S8, U16, S16, U32, S32, F16, F32
- * - U16 -> U8, S8, S16, U32, S32, F16, F32
- * - S16 -> U8, S8, U16, U32, S32, F16, F32
- * - U32 -> U8, S8, U16, S16, S32, F16, F32
- * - S32 -> U8, S8, U16, S16, U32, F16, F32
- * - F16 -> U8, S8, U16, S16, U32, F32
- * - F32 -> U8, S8, U16, S16, U32, F16
- *
- * @param[in] input The input tensor to convert. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32.
- * @param[out] output The output tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
- * @param[in] policy Conversion policy
- * @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8.
- */
- void configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift);
- /** Set the input and output of the kernel.
- *
- * Valid conversions Input -> Output :
- *
- * - QSYMM8_PER_CHANNEL -> QASYMM8 (ATTENTION: it is the user's responsibility to keep track of the quantization info in the TensorInfo meta-data)
- * - U8 -> S8, U16, S16, U32, S32, F16, F32
- * - U16 -> U8, S8, S16, U32, S32, F16, F32
- * - S16 -> U8, S8, U16, U32, S32, F16, F32
- * - U32 -> U8, S8, U16, S16, S32, F16, F32
- * - S32 -> U8, S8, U16, S16, U32, F16, F32
- * - F16 -> U8, S8, U16, S16, U32, F32
- * - F32 -> U8, S8, U16, S16, U32, F16
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input The input tensor to convert. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32.
- * @param[out] output The output tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
- * @param[in] policy Conversion policy
- * @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift);
- /** Static function to check if given info will lead to a valid configuration of @ref CLDepthConvertLayerKernel
- *
- * @param[in] input Source tensor info. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32.
- * @param[in] output Destination tensor info. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
- * @param[in] policy Conversion policy
- * @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift);
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H */
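The ConvertPolicy parameter documented above selects between clamping and modular truncation on down-conversions; for integer inputs the shift is applied before converting. A minimal S32 -> U8 sketch of the two behaviours (convert_s32_to_u8 is illustrative, not library API):

#include <algorithm>
#include <cstdint>

std::uint8_t convert_s32_to_u8(std::int32_t v, bool saturate)
{
    if (saturate)
    {
        return static_cast<std::uint8_t>(std::clamp<std::int32_t>(v, 0, 255)); // ConvertPolicy::SATURATE
    }
    return static_cast<std::uint8_t>(v); // ConvertPolicy::WRAP: keep the low 8 bits
}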
diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
index 8946f2a713..cdf19ab2e1 100644
--- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -48,12 +50,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != 0);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape * input->tensor_shape()[idx_width]));
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape * input->tensor_shape()[idx_height]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] !=
+ (block_shape * input->tensor_shape()[idx_width]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] !=
+ (block_shape * input->tensor_shape()[idx_height]));
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -62,9 +66,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
}
} // namespace
-CLDepthToSpaceLayerKernel::CLDepthToSpaceLayerKernel()
- : _input(nullptr), _output(nullptr), _block_shape()
+CLDepthToSpaceLayerKernel::CLDepthToSpaceLayerKernel() : _input(nullptr), _output(nullptr), _block_shape()
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLDepthToSpaceLayerKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape)
@@ -72,14 +76,18 @@ void CLDepthToSpaceLayerKernel::configure(const ICLTensor *input, ICLTensor *out
configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
}
-void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ int32_t block_shape)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
+ TensorShape output_shape =
+ compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape));
@@ -96,7 +104,9 @@ void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_contex
build_opts.add_option("-DCHANNEL_SIZE=" + support::cpp11::to_string(input->info()->dimension(idx_channel)));
build_opts.add_option("-DBLOCK_SHAPE=" + support::cpp11::to_string(block_shape));
build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
- _kernel = create_kernel(compile_context, "depth_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel = create_kernel(compile_context,
+ "depth_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps());
@@ -135,7 +145,6 @@ void CLDepthToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu
enqueue(queue, *this, slice_in, lws_hint());
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_in));
+ } while (window.slide_window_slice_3D(slice_in));
}
} // namespace arm_compute
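For reference, the rearrangement validated above shrinks the channel count by block_shape^2 while growing width and height by block_shape. A hedged NCHW sketch under the common depth-column-row element ordering (depth_to_space_nchw is illustrative, not library code):

#include <cstddef>
#include <vector>

std::vector<float> depth_to_space_nchw(const std::vector<float> &in,
                                       std::size_t C, std::size_t H, std::size_t W, std::size_t block)
{
    const std::size_t Co = C / (block * block); // output channel count
    std::vector<float> out(in.size());
    for (std::size_t c = 0; c < Co; ++c)
        for (std::size_t y = 0; y < H * block; ++y)
            for (std::size_t x = 0; x < W * block; ++x)
            {
                // The position inside the block selects the source channel slice.
                const std::size_t ci = c + Co * ((y % block) * block + (x % block));
                out[(c * H * block + y) * W * block + x] = in[(ci * H + y / block) * W + x / block];
            }
    return out;
}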
diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h
index 1f7f77b569..cef70c4dda 100644
--- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h
+++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLDEPTHTOSPACELAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -61,7 +62,8 @@ public:
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] block_shape Block shape value.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+ void
+ configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
/** Static function to check if given info will lead to a valid configuration of @ref CLDepthToSpaceLayerKernel.
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
deleted file mode 100644
index ba7a782bf1..0000000000
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
+++ /dev/null
@@ -1,434 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D dilation,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.enabled()) && (input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC),
- "For QASYMM8 only logistic, relu, lower bounded relu and lower-upper bounded relu are supported");
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != 3 || weights->dimension(1) != 3);
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1 || conv_info.stride().first > 3);
-
- ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
-
- const bool is_qasymm = is_data_type_quantized_asymmetric(input->data_type());
-
- if(biases != nullptr)
- {
- if(is_qasymm)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
- }
- ARM_COMPUTE_RETURN_ERROR_ON((biases->dimension(0) != weights->dimension(2)) && (weights->dimension(2) != 1 || biases->dimension(0) != weights->dimension(3)));
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
-
- if(is_qasymm)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
-
- if(is_data_type_quantized_per_channel(weights->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != output_multipliers->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != output_shifts->dimension(0));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_RETURN_ERROR_ON(1 != output_multipliers->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON(1 != output_shifts->dimension(0));
- }
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- }
-
- if(output->total_size() != 0)
- {
- const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, GPUTarget gpu_target, std::string &kernel_name, const Size2D dilation)
-{
- // Output auto initialization if not yet initialized
- const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_quantization_info(output->quantization_info()));
-
- const unsigned int conv_stride_x = conv_info.stride().first;
- const unsigned int conv_stride_y = conv_info.stride().second;
- const bool is_qasymm = is_data_type_quantized_asymmetric(input->data_type());
- const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST;
-
- // Configure kernel window
- unsigned int num_elems_read_per_iteration_x = 0;
- unsigned int num_elems_read_per_iteration_y = 0;
- unsigned int num_elems_written_per_iteration_x = 0;
- unsigned int num_elems_written_per_iteration_y = 0;
-
- if(input->data_type() == DataType::F16)
- {
- kernel_name = "depthwise_convolution_3x3_f16";
- num_elems_written_per_iteration_x = 8 / data_size_from_type(input->data_type());
- num_elems_written_per_iteration_y = 1;
- num_elems_read_per_iteration_y = 3;
- switch(conv_stride_x)
- {
- case 1:
- num_elems_read_per_iteration_x = 8;
- break;
- case 2:
- num_elems_read_per_iteration_x = 9;
- break;
- case 3:
- num_elems_read_per_iteration_x = 16;
- break;
- default:
- num_elems_read_per_iteration_x = 3 + (num_elems_written_per_iteration_x - 1) * conv_stride_x;
- break;
- }
- if(is_bifrost)
- {
- if(conv_stride_x == 1 && conv_stride_y == 1)
- {
- kernel_name = "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16";
- num_elems_read_per_iteration_x = 8;
- num_elems_written_per_iteration_x = 4;
- num_elems_read_per_iteration_y = 6;
- num_elems_written_per_iteration_y = 4;
- }
- else if(conv_stride_x == 2 && conv_stride_y == 2)
- {
- kernel_name = "depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16";
- num_elems_read_per_iteration_x = 10;
- num_elems_written_per_iteration_x = 4;
- num_elems_read_per_iteration_y = 5;
- num_elems_written_per_iteration_y = 2;
- }
- }
- }
- else if(input->data_type() == DataType::F32 && is_bifrost)
- {
- if(conv_stride_x == 1 && conv_stride_y == 1)
- {
- kernel_name = "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32";
- num_elems_read_per_iteration_x = 4;
- num_elems_read_per_iteration_y = 6;
- num_elems_written_per_iteration_x = 2;
- num_elems_written_per_iteration_y = 4;
- }
- else if(conv_stride_x == 2 && conv_stride_y == 2)
- {
- kernel_name = "depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32";
- num_elems_read_per_iteration_x = 6;
- num_elems_read_per_iteration_y = 5;
- num_elems_written_per_iteration_x = 2;
- num_elems_written_per_iteration_y = 2;
- }
- else
- {
- kernel_name = "depthwise_convolution_3x3";
- num_elems_written_per_iteration_x = 8 / data_size_from_type(input->data_type());
- num_elems_written_per_iteration_y = 1;
- num_elems_read_per_iteration_x = 3 + (num_elems_written_per_iteration_x - 1) * conv_stride_x;
- num_elems_read_per_iteration_y = 3;
- }
- }
- else
- {
- const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()) && !is_data_type_quantized_per_channel(weights->data_type());
-
- kernel_name = is_qasymm ? "dwc_3x3_native_quantized8" : "depthwise_convolution_3x3";
- kernel_name += (is_qasymm && is_dot8_supported ? "_dot8" : "");
- kernel_name += (is_qasymm ? "_nchw" : "");
-
- num_elems_written_per_iteration_x = 8 / data_size_from_type(input->data_type());
- num_elems_written_per_iteration_y = (is_qasymm && conv_stride_y == 1 && dilation.y() == 1) ? 2 : 1;
- num_elems_read_per_iteration_x = 3 + (num_elems_written_per_iteration_x - 1) * conv_stride_x + (conv_stride_x > 1 ? 1 : 0);
- num_elems_read_per_iteration_y = num_elems_written_per_iteration_y + 2;
- }
- // The OpenCL routine convolution1x3 does loadn(addr), loadn(addr + dilation_x) and loadn(addr + 2 * dilation_x) on the input.
- // convolution1x3 is itself called three times, with addr, (addr + dilation_y) and (addr + 2 * dilation_y)
- // Hence we must add 2 * dilation.x/y() to the number of elements read in those axes per thread
- num_elems_read_per_iteration_x += 2 * dilation.x();
- num_elems_read_per_iteration_y += 2 * dilation.y();
-
- // Create window and update padding
- Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
-
- AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(),
- num_elems_read_per_iteration_x, num_elems_read_per_iteration_y,
- conv_stride_x, conv_stride_y);
- AccessWindowStatic weights_access(weights, 0, 0, 3, 3);
- AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
-
- bool window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLDepthwiseConvolutionLayer3x3NCHWKernel::CLDepthwiseConvolutionLayer3x3NCHWKernel()
- : _conv_stride_x(0), _conv_pad_top(0), _conv_pad_left(0)
-{
-}
-
-BorderSize CLDepthwiseConvolutionLayer3x3NCHWKernel::border_size() const
-{
- return _border_size;
-}
-
-void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation, output_multipliers, output_shifts);
-}
-
-void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
- conv_info, depth_multiplier, act_info, dilation,
- (output_multipliers != nullptr) ? output_multipliers->info() : nullptr,
- (output_shifts != nullptr) ? output_shifts->info() : nullptr));
-
- _input = input;
- _output = output;
- _weights = weights;
- _biases = biases;
- _conv_stride_x = conv_info.stride().first;
- _conv_stride_y = conv_info.stride().second;
- _conv_pad_left = conv_info.pad_left();
- _conv_pad_top = conv_info.pad_top();
- _output_multipliers = output_multipliers;
- _output_shifts = output_shifts;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
-
- // Configure kernel window
- std::string kernel_name;
- const GPUTarget gpu_target = get_target();
-
- auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, depth_multiplier, gpu_target, kernel_name, dilation);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- _border_size = BorderSize(input->info()->padding());
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
- build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(_output->info()->tensor_shape().z()));
- build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(depth_multiplier));
- build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(_conv_stride_x));
- build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
- build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
- build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
-
- if(_is_quantized)
- {
- const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform();
- const UniformQuantizationInfo wq_info = _weights->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform();
-
- const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type());
- const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()) && !is_quantized_per_channel;
- build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(_conv_stride_y));
- build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iq_info.offset));
- build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wq_info.offset));
- build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oq_info.offset));
- build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(9 * iq_info.offset * wq_info.offset));
- build_opts.add_option_if(is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
- build_opts.add_option_if(is_dot8_supported, "-DIS_DOT8");
-
- // Compute the non-per-channel multiplier and shift anyway, to keep the OpenCL kernel simpler
- float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
- int output_multiplier = 0;
- int output_shift = 0;
- quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
- build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
- build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
-
- if(act_info.enabled())
- {
- int a_val{};
- int b_val{};
- std::tie(b_val, a_val) = get_quantized_activation_min_max(act_info, input->info()->data_type(), oq_info);
-
- const int o1 = oq_info.offset;
-
- build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
- build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
- build_opts.add_option("-DCONST_0=" + support::cpp11::to_string(o1));
-
- const float s1 = iq_info.scale;
- build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
- build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
- }
-
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DWEIGHTS_TYPE=" + get_cl_type_from_data_type(weights->info()->data_type()));
- build_opts.add_option("-DWEIGHTS_PROMOTED_TYPE=" + get_cl_promoted_type_from_data_type(weights->info()->data_type()));
- }
- else
- {
- build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
- build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
- build_opts.add_option_if(act_info.enabled(), "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(win_config.second.x().step()));
- }
-
- build_opts.add_option_if(input->info()->data_type() == DataType::F16, "-DIS_F16");
- build_opts.add_option_if(input->info()->data_type() == DataType::F32, "-DIS_F32");
-
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-Status CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target,
- const Size2D &dilation, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
- std::string kernel_name;
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation, output_multipliers, output_shifts));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(),
- conv_info, depth_multiplier, gpu_target, kernel_name, dilation)
- .first);
-
- return Status{};
-}
-
-void CLDepthwiseConvolutionLayer3x3NCHWKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-
- // Create input window and adjust
- Window collapsed_in = collapsed;
- collapsed_in.adjust(Window::DimX, -_conv_pad_left, true);
- collapsed_in.adjust(Window::DimY, -_conv_pad_top, true);
- collapsed_in.set_dimension_step(Window::DimX, collapsed_in.x().step() * _conv_stride_x);
- collapsed_in.set_dimension_step(Window::DimY, collapsed_in.y().step() * _conv_stride_y);
-
- Window slice_in = collapsed_in.first_slice_window_3D();
- Window slice_out = collapsed.first_slice_window_3D();
- Window slice_weights = window.first_slice_window_3D();
- slice_weights.set_dimension_step(Window::DimX, 0);
- slice_weights.set_dimension_step(Window::DimY, 0);
-
- unsigned int idx = 3 * num_arguments_per_3D_tensor();
-
- // Set output multipliers in case of quantized data type
- if(_is_quantized)
- {
- Window slice;
- slice.use_tensor_dimensions(_output_multipliers->info()->tensor_shape());
- add_1D_tensor_argument(idx, _output_multipliers, slice);
- add_1D_tensor_argument(idx, _output_shifts, slice);
- }
-
- // Set biases
- if(_biases != nullptr)
- {
- Window slice_biases;
- slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
- add_1D_tensor_argument(idx, _biases, slice_biases);
- }
-
- do
- {
- idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- add_3D_tensor_argument(idx, _output, slice_out);
- add_3D_tensor_argument(idx, _weights, slice_weights);
-
- enqueue(queue, *this, slice_out, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice_out) && collapsed_in.slide_window_slice_3D(slice_in));
-}
-} // namespace arm_compute
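
The input-window arithmetic in run() above is easiest to check with concrete numbers; a minimal sketch, assuming stride 2 and pad 1, with plain ints standing in for the Window API:

    #include <cstdio>

    int main()
    {
        const int stride_x = 2, pad_left = 1; // hypothetical stride/pad values
        for(int out_x = 0; out_x < 4; ++out_x)
        {
            // adjust(DimX, -pad_left) plus a step scaled by the stride anchors each
            // 3x3 receptive field at -1, 1, 3, 5 (negatives fall in the border region)
            std::printf("out %d <- in %d\n", out_x, out_x * stride_x - pad_left);
        }
        return 0;
    }
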
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h
deleted file mode 100644
index 45b5869676..0000000000
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNCHWKERNEL3x3_H
-#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNCHWKERNEL3x3_H
-
-#include "src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor when the data layout is NCHW.
- */
-class CLDepthwiseConvolutionLayer3x3NCHWKernel : public ICLDepthwiseConvolutionLayer3x3Kernel
-{
-public:
- /** Default constructor */
- CLDepthwiseConvolutionLayer3x3NCHWKernel();
- /** Initialize the function's source, destination, convolution information and border size.
- *
- * @param[in] input Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [3, 3, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in]  act_info          (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported for QASYMM8.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- *                                the number of shifts must be equal to the number of filters (IFM). Supported data types: S32
- */
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
- const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override;
- /** Initialize the function's source, destination, convolution information and border size.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [3, 3, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in]  act_info          (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported for QASYMM8.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- *                                the number of shifts must be equal to the number of filters (IFM). Supported data types: S32
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
- const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override;
- /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3NCHWKernel
- *
- * @param[in] input Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor info. A 3D tensor with dimensions [3, 3, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in]  output            Destination tensor info. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
- * @param[in] gpu_target (Optional) GPU target to validate the kernel for. Defaults to midgard.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor info for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in]  output_shifts      (Optional) Output shifts tensor info for quantized computations. In case of per-channel quantization,
- *                                the number of shifts must be equal to the number of filters (IFM). Supported data types: S32
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), GPUTarget gpu_target = GPUTarget::MIDGARD,
- const Size2D &dilation = Size2D(1U, 1U), const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
-
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- unsigned int _conv_stride_x;
- unsigned int _conv_pad_top;
- unsigned int _conv_pad_left;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNCHWKERNEL3x3_H */
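
Putting the interface above together, callers typically validated and then configured the kernel as follows; a minimal usage sketch (tensor objects hypothetical, error handling elided):

    // src: {W, H, IFM} NCHW tensor, wei: {3, 3, IFM}, dst: auto-initialised by configure().
    CLDepthwiseConvolutionLayer3x3NCHWKernel kernel;
    const PadStrideInfo conv_info(1 /* stride_x */, 1 /* stride_y */, 1 /* pad_x */, 1 /* pad_y */);
    ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(
        src.info(), wei.info(), nullptr /* biases */, dst.info(), conv_info));
    kernel.configure(&src, &wei, nullptr /* biases */, &dst, conv_info);
    // depth_multiplier, act_info, gpu_target and dilation keep their documented defaults.
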
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
deleted file mode 100644
index d13afd2010..0000000000
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
+++ /dev/null
@@ -1,464 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.enabled()) && (input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC),
-                                    "For QASYMM8 only logistic, relu, bounded relu and lower-upper bounded relu are supported");
- ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier > 1); // COMPMID-1071 Add depth multiplier support for NHWC
-
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1);
- ARM_COMPUTE_RETURN_ERROR_ON(std::max(conv_info.pad_top(), conv_info.pad_bottom()) > 4);
-
- ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
-
- const bool is_qasymm = is_data_type_quantized_asymmetric(input->data_type());
- const size_t weights_width = 3;
- const size_t weights_height = 3;
-
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(
- *input, TensorInfo(TensorShape(weights_width, weights_height), 1, weights->data_type()).set_data_layout(DataLayout::NCHW), conv_info, depth_multiplier, dilation);
- if(is_qasymm)
- {
- DepthwiseConvolutionReshapeInfo info;
- info.c0 = 4;
- ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(0) / info.c0) != weights_width * weights_height);
-
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
-
- if(is_data_type_quantized_per_channel(weights->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON(output_shape[0] != output_multipliers->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON(output_shape[0] != output_shifts->dimension(0));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_RETURN_ERROR_ON(1 != output_multipliers->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON(1 != output_shifts->dimension(0));
- }
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(1) != weights_width) || (weights->dimension(2) != weights_height));
- }
-
- if(biases != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != output_shape[0]);
- if(is_qasymm)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
- }
-
- return Status{};
-}
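
The output_shape validated above follows the standard dilated-convolution size formula; a minimal sketch of the per-dimension computation (FLOOR rounding assumed):

    // out = (in + pad_before + pad_after - dilated_k) / stride + 1, where
    // dilated_k = (k - 1) * dilation + 1. Example: in = 224, k = 3, pads = 1 + 1,
    // dilation = 1, stride = 2  ->  (224 + 2 - 3) / 2 + 1 = 112.
    int output_dim(int in, int k, int pad_before, int pad_after, int stride, int dilation)
    {
        const int dilated_k = (k - 1) * dilation + 1;
        return (in + pad_before + pad_after - dilated_k) / stride + 1;
    }
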
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
- ITensorInfo *output_multipliers, ITensorInfo *output_shifts)
-{
- ARM_COMPUTE_UNUSED(weights);
- ARM_COMPUTE_UNUSED(depth_multiplier);
-
- const bool is_stride_1_dilation_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1) && dilation.x() == 1 && dilation.y() == 1);
- unsigned int num_rows_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1;
-
- Window win{};
- Status err{};
-
- if(is_data_type_quantized_asymmetric(input->data_type()))
- {
- const unsigned int num_elems_accessed_per_iteration = 4;
- const unsigned int num_rows_read_per_iteration = num_rows_processed_per_iteration + 2;
- const unsigned int num_rows_written_per_iteration = std::ceil(num_rows_processed_per_iteration / static_cast<float>(conv_info.stride().first));
-
- BorderSize border_size;
- border_size = BorderSize(conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0);
-
- // Configure kernel window
- win = calculate_max_window(*output, Steps(num_elems_accessed_per_iteration, num_rows_written_per_iteration));
-
- AccessWindowStatic input_access(input, 0, -border_size.top, ceil_to_multiple(input->dimension(0), num_elems_accessed_per_iteration),
- ceil_to_multiple(input->dimension(1) + border_size.bottom, num_rows_read_per_iteration));
- AccessWindowRectangle output_access(output, 0, 0, num_elems_accessed_per_iteration, num_rows_written_per_iteration);
-
- bool window_changed = false;
-
- if((output_multipliers != nullptr) && (output_shifts != nullptr))
- {
- AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_accessed_per_iteration);
- AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_accessed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, input_access, output_access, output_multipliers_access, output_shifts_access);
- }
- else
- {
- Status err = ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "output_multipliers and output_shifts must be non-nullptr for quantized input");
- return std::make_pair(err, win);
- }
-
- if(bias != nullptr)
- {
- AccessWindowHorizontal bias_access(bias, 0, num_elems_accessed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, bias_access);
- }
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
- err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- }
- else
- {
- unsigned int num_elems_accessed_per_iteration = adjust_vec_size(4 / input->element_size(), input->dimension(0));
- win = calculate_max_window(*output, Steps(num_elems_accessed_per_iteration, num_rows_processed_per_iteration));
- }
-
- return std::make_pair(err, win);
-}
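
With numbers plugged in, the quantized-path window bookkeeping above works out as follows; a short sketch (stride value hypothetical):

    #include <cmath>

    // stride 1, dilation 1: two output rows are produced per iteration, and the 3x3
    // taps need one extra row above and below, so processed + 2 input rows are read.
    const unsigned int stride_x     = 1;                        // conv_info.stride().first
    const unsigned int processed    = 2;                        // is_stride_1_dilation_1
    const unsigned int rows_read    = processed + 2;            // 4
    const unsigned int rows_written =
        std::ceil(processed / static_cast<float>(stride_x));    // ceil(2 / 1) = 2
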
-} // namespace
-
-CLDepthwiseConvolutionLayer3x3NHWCKernel::CLDepthwiseConvolutionLayer3x3NHWCKernel()
- : _num_planes_processed_per_iteration(1)
-{
-}
-
-BorderSize CLDepthwiseConvolutionLayer3x3NHWCKernel::border_size() const
-{
- return _border_size;
-}
-
-void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation, output_multipliers, output_shifts);
-}
-
-void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
- conv_info, depth_multiplier, act_info, dilation,
- (output_multipliers != nullptr) ? output_multipliers->info() : nullptr,
- (output_shifts != nullptr) ? output_shifts->info() : nullptr));
-
- auto padding_info = get_padding_info({ input, weights, biases, output });
-
- auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
- conv_info, depth_multiplier, dilation,
- (output_multipliers != nullptr) ? output_multipliers->info() : nullptr,
- (output_shifts != nullptr) ? output_shifts->info() : nullptr);
-
- const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
- const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
- const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type());
- const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()) && !is_quantized_per_channel;
-
- _input = input;
- _output = output;
- _weights = weights;
- _biases = biases;
- _conv_stride_y = conv_info.stride().second;
- _num_planes_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1;
- _output_multipliers = output_multipliers;
- _output_shifts = output_shifts;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
-
- if(_is_quantized)
- {
- _border_size = BorderSize(input->info()->padding());
-
-        // If the data type is QASYMM8 and the 8-bit dot product is available, force _num_planes_processed_per_iteration to 1
- if(is_dot8_supported)
- {
- _num_planes_processed_per_iteration = 1;
- }
- }
-
- unsigned int num_elems_accessed_per_iteration = _is_quantized ? 4 : adjust_vec_size(4 / input->info()->element_size(), input->info()->dimension(0));
- unsigned int num_rows_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1;
-
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
- build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_accessed_per_iteration));
- build_opts.add_option("-DSRC_DIM_1=" + support::cpp11::to_string(_input->info()->dimension(1)));
- build_opts.add_option("-DSRC_DIM_2=" + support::cpp11::to_string(_input->info()->dimension(2)));
- build_opts.add_option("-DCONV_PAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
- build_opts.add_option("-DCONV_PAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
- build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_accessed_per_iteration));
- build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
- build_opts.add_option_if(_input->info()->tensor_shape().total_size_upper(3) > 1,
- "-DDST_DEPTH=" + support::cpp11::to_string(static_cast<int>(std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)))));
-
- if(_is_quantized)
- {
- const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform();
- const UniformQuantizationInfo wq_info = _weights->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform();
-
- build_opts.add_option("-DSRC_DIM_1=" + support::cpp11::to_string(_input->info()->dimension(1)));
- build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iq_info.offset));
- build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wq_info.offset));
- build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oq_info.offset));
- build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(9 * iq_info.offset * wq_info.offset));
- build_opts.add_option_if(is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
- build_opts.add_option_if(is_dot8_supported, "-DIS_DOT8");
-
-        // Compute the non-per-channel multiplier and shift anyway, to keep the OpenCL kernel simple
- float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
- int output_multiplier = 0;
- int output_shift = 0;
- quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
- build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
- build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
-
- if(act_info.enabled())
- {
- int a_val{};
- int b_val{};
- std::tie(b_val, a_val) = get_quantized_activation_min_max(act_info, input->info()->data_type(), oq_info);
-
- const int o1 = oq_info.offset;
-
- build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
- build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
- build_opts.add_option("-DCONST_0=" + support::cpp11::to_string(o1));
-
- const float s1 = iq_info.scale;
- build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
- build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
- }
-
- build_opts.add_option("-DWEIGHTS_TYPE=" + get_cl_type_from_data_type(weights->info()->data_type()));
- build_opts.add_option("-DWEIGHTS_PROMOTED_TYPE=" + get_cl_promoted_type_from_data_type(weights->info()->data_type()));
- }
- else
- {
- build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
- build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
- }
-
- if(is_stride_1_dilation_1)
- {
- build_opts.add_option("-DNUM_ROWS_PROCESSED=" + support::cpp11::to_string(num_rows_processed_per_iteration));
- build_opts.add_option("-DNUM_PLANES_PROCESSED=" + support::cpp11::to_string(_num_planes_processed_per_iteration));
- build_opts.add_option("-DDST_DIM_1=" + support::cpp11::to_string(_output->info()->dimension(1)));
- build_opts.add_option("-DDST_DIM_2=" + support::cpp11::to_string(_output->info()->dimension(2)));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string((input->info()->dimension(1) + conv_info.pad_left() + conv_info.pad_right()) % num_rows_processed_per_iteration));
- }
- else
- {
- build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
- build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(_conv_stride_y));
- build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
- build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
- }
-
- std::string kernel_name;
- // Create kernel
- if(_is_quantized)
- {
- kernel_name = std::string("dwc_3x3_reshaped_quantized8");
- kernel_name += (is_dot8_supported && is_stride_1_dilation_1 ? "_dot8" : "");
- kernel_name += (is_stride_1_dilation_1 ? "_stride1" : "");
- kernel_name += "_nhwc";
- }
- else
- {
- kernel_name = std::string("depthwise_convolution_3x3_nhwc");
- kernel_name += (is_stride_1_dilation_1 ? "_stride1" : "");
- }
-
- ICLKernel::configure_internal(win_config.second);
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- ARM_COMPUTE_ERROR_ON(!_is_quantized && has_padding_changed(padding_info));
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += string_from_data_type(input->info()->data_type());
-}
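
Two pieces of the quantized arithmetic above deserve a worked form. First, K_OFFSET = 9 * input_offset * weights_offset is the constant term from expanding sum((x_q - x_off) * (w_q - w_off)) over the nine 3x3 taps. Second, calculate_quantized_multiplier re-expresses the real rescale factor iq.scale * wq.scale / oq.scale as an integer multiply plus shift; a minimal sketch of that decomposition (sign convention assumed: positive shift means a right shift in the kernel):

    #include <cmath>
    #include <cstdint>

    // real_multiplier ~= quantized_multiplier * 2^-(31 + shift)
    void decompose(float real_multiplier, int32_t &quantized_multiplier, int &shift)
    {
        int exponent = 0;
        const double mantissa = std::frexp(real_multiplier, &exponent); // in [0.5, 1)
        int64_t q_fixed = static_cast<int64_t>(std::llround(mantissa * (1ll << 31)));
        if(q_fixed == (1ll << 31)) // rounding pushed the mantissa up to 1.0
        {
            q_fixed /= 2;
            ++exponent;
        }
        quantized_multiplier = static_cast<int32_t>(q_fixed);
        shift                = -exponent;
        // Example: 0.25 -> mantissa 0.5, exponent -1 -> multiplier 2^30, shift 1.
    }
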
-
-Status CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation, output_multipliers, output_shifts));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(),
- biases != nullptr ? biases->clone().get() : nullptr,
- output->clone().get(), conv_info, depth_multiplier, dilation,
- (output_multipliers != nullptr) ? output_multipliers->clone().get() : nullptr,
- (output_shifts != nullptr) ? output_shifts->clone().get() : nullptr)
- .first);
- return Status{};
-}
-
-void CLDepthwiseConvolutionLayer3x3NHWCKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- const size_t total_batches = _input->info()->tensor_shape().total_size_upper(3);
-
- Window win = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- win.set(Window::DimZ, Window::Dimension(0, std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)) * total_batches, 1));
-
- unsigned int idx = 2 * num_arguments_per_4D_tensor() + (_is_quantized ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor());
-
- if(_is_quantized)
- {
- Window slice;
- slice.use_tensor_dimensions(_output_multipliers->info()->tensor_shape());
- slice.set_dimension_step(Window::DimX, window.x().step());
- add_1D_tensor_argument(idx, _output_multipliers, slice);
- add_1D_tensor_argument(idx, _output_shifts, slice);
- }
-
- if(_biases != nullptr)
- {
- Window win_biases;
- win_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
- win_biases.set_dimension_step(Window::DimX, window.x().step());
- add_1D_tensor_argument(idx, _biases, win_biases);
- }
-
- if(_is_quantized)
- {
-        // Calculate max_offset.
-        // max_offset is the offset of the last non-valid value in the Z dimension (spatial dimension Y for NHWC)
- // |******************|
- // | pad_top |
- // |******************|
- // | |
- // | plane0 |
- // | batch0 |
- // |__________________|
- // |******************| Batch 0
- // | pad_bottom |
- // | pad_top |
- // |******************|
- // | |
- // | plane1 |
- // | batch0 |
- // |__________________|-----> max_offset
- // |******************|
- // | pad_bottom |
- // | pad_top |
- // |******************|
- // | |
- // | plane0 |
- // | batch1 |
- // |__________________|
- // |******************| Batch 1
- // | pad_bottom |
- // | pad_top |
- // |******************|
- // | |
- // | plane1 |
- // | batch1 |
- // |__________________|
- // | pad_bottom |
- // |******************|
-        const int max_offset = ((_input->info()->dimension(1) * _input->info()->dimension(2)) + (_input->info()->padding().bottom + _input->info()->padding().top) * (_input->info()->dimension(2) - 1)) * _input->info()->strides_in_bytes().y();
- _kernel.setArg(idx, max_offset);
- }
-
- Window slice = win.first_slice_window_4D();
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice);
- add_4D_tensor_argument(idx, _output, slice);
- if(_is_quantized)
- {
- add_2D_tensor_argument(idx, _weights, slice);
- }
- else
- {
- add_3D_tensor_argument(idx, _weights, slice);
- }
- enqueue(queue, *this, slice, lws_hint());
- }
- while(win.slide_window_slice_4D(slice));
-}
-} // namespace arm_compute
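
The max_offset derived from the diagram above collapses to a single expression; a sketch with concrete (hypothetical) numbers:

    // max_offset = (dim1 * dim2 + (pad_top + pad_bottom) * (dim2 - 1)) * stride_y_bytes
    const int dim1 = 16, dim2 = 8;  // NHWC width and height (hypothetical)
    const int pad_top = 1, pad_bottom = 1;
    const int stride_y_bytes = 64;  // bytes between consecutive rows
    const int max_offset = (dim1 * dim2 + (pad_top + pad_bottom) * (dim2 - 1)) * stride_y_bytes;
    // (128 + 14) * 64 = 9088 bytes: just past the last valid plane of batch 0, letting
    // the kernel clamp out-of-range loads instead of reading into the next batch.
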
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h
deleted file mode 100644
index ce0bf5ceb3..0000000000
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNHWCKERNEL3x3_H
-#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNHWCKERNEL3x3_H
-
-#include "src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor when the data layout is NHWC.
- */
-class CLDepthwiseConvolutionLayer3x3NHWCKernel : public ICLDepthwiseConvolutionLayer3x3Kernel
-{
-public:
- /** Default constructor */
- CLDepthwiseConvolutionLayer3x3NHWCKernel();
- /** Initialize the function's source, destination, convolution information and border size.
- *
- * @param[in] input Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, 3, 3].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- *                                 the number of shifts must be equal to the number of filters (IFM). Supported data types: S32
- */
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
- const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override;
- /** Initialize the function's source, destination, convolution information and border size.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, 3, 3].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- *                                 the number of shifts must be equal to the number of filters (IFM). Supported data types: S32
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
- const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override;
- /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3NHWCKernel
- *
- * @param[in] input Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor info. A 3D tensor with dimensions [IFM, 3, 3].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor info. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor info for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in]  output_shifts      (Optional) Output shifts tensor info for quantized computations. In case of per-channel quantization,
- *                                 the number of shifts must be equal to the number of filters (IFM). Supported data types: S32
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
- const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- unsigned int _num_planes_processed_per_iteration;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNHWCKERNEL3x3_H */
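
Note the weights shape difference against the NCHW header earlier: the channel dimension moves to position 0. A minimal sketch of the two shapes (TensorShape lists the fastest-moving dimension first):

    #include "arm_compute/core/TensorShape.h"

    const unsigned int ifm = 32U;                          // hypothetical channel count
    const arm_compute::TensorShape wei_nhwc(ifm, 3U, 3U);  // [IFM, 3, 3], as documented above
    const arm_compute::TensorShape wei_nchw(3U, 3U, ifm);  // [3, 3, IFM], the NCHW variant
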
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index c34018a000..b95abe795f 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,46 +28,94 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLUtils.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/CL/ICLKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCWeightsKernelInfo &dwc_weights_info,
- const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts)
{
ARM_COMPUTE_UNUSED(dwc_info);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ bool in_place = false;
+ if (output == nullptr || output == input)
+ {
+ in_place = true;
+ output = input;
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier > 1 && dwc_weights_info.n0 != 1);
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1);
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().second < 1);
- ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && dwc_info.m0 != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && dwc_info.m0 != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_input_to_cl_image == true));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) &&
+ (export_to_cl_image(weights) == false),
+ "Weights cannot be exported to cl_image!");
+ ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_weights_to_cl_image == true) && ((dwc_info.n0 % 4) != 0));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().second < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON((conv_info.dilation.x() < 1) || (conv_info.dilation.y() < 1));
const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_UNUSED(idx_c);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_c) != (input->dimension(idx_c) * depth_multiplier));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_c) != (input->dimension(idx_c) * conv_info.depth_multiplier));
+
+ // In place restrictions
+ if (in_place)
+ {
+ const int weights_width_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+ const int weights_height_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[weights_width_idx] != 1U ||
+ weights->tensor_shape()[weights_height_idx] != 1U);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.depth_multiplier != 1U);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride() != std::make_pair(1U, 1U));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation != Size2D(1U, 1U));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ conv_info.pad_stride_info
+            .has_padding()); // Note that in principle padding could be supported with in_place, but we choose not to support it
+ }
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
+ const ConvolutionInfo info{conv_info.pad_stride_info, conv_info.depth_multiplier, ActivationLayerInfo(),
+ conv_info.dilation};
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info);
+
+ if (conv_info.depth_multiplier > 1 && dwc_info.n0 > 1)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON((conv_info.depth_multiplier % dwc_info.n0) != 0);
+ }
const bool is_quantized = is_data_type_quantized(input->data_type());
- if(biases != nullptr)
+ if (biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != output_shape[idx_c]);
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- if(is_quantized)
+ if (is_quantized)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
}
@@ -77,7 +125,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
}
}
- if(is_quantized)
+ if (is_quantized)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
@@ -85,7 +133,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
- if(is_data_type_quantized_per_channel(weights->data_type()))
+ if (is_data_type_quantized_per_channel(weights->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(output_shape[idx_c] != output_multipliers->dimension(0));
@@ -103,22 +151,24 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
}
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
- if(is_data_type_quantized(input->data_type()))
+ if (is_data_type_quantized(input->data_type()))
{
const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info;
+ const UniformQuantizationInfo oq_info =
+ (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info;
float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
int output_multiplier = 0;
int output_shift = 0;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
}
return Status{};
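
The in-place restrictions enforced above amount to requiring a pure element-wise 1x1 pass over the input; a minimal predicate sketch mirroring those checks (helper name hypothetical, not library API; arm_compute includes elided):

    bool can_run_in_place(const arm_compute::ITensorInfo &weights, const arm_compute::ConvolutionInfo &conv_info)
    {
        using namespace arm_compute;
        const int wi = get_data_layout_dimension_index(weights.data_layout(), DataLayoutDimension::WIDTH);
        const int hi = get_data_layout_dimension_index(weights.data_layout(), DataLayoutDimension::HEIGHT);
        return weights.tensor_shape()[wi] == 1U && weights.tensor_shape()[hi] == 1U &&
               conv_info.depth_multiplier == 1U &&
               conv_info.pad_stride_info.stride() == std::make_pair(1U, 1U) &&
               conv_info.dilation == Size2D(1U, 1U) &&
               !conv_info.pad_stride_info.has_padding();
    }
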
@@ -133,110 +183,194 @@ CLDepthwiseConvolutionLayerNativeKernel::CLDepthwiseConvolutionLayerNativeKernel
_depth_multiplier(1),
_output_multipliers(nullptr),
_output_shifts(nullptr),
+ _export_input_to_cl_image(false),
+ _export_weights_to_cl_image(false),
_is_quantized(false)
{
+ _type = CLKernelType::DEPTHWISE;
}
-void CLDepthwiseConvolutionLayerNativeKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
- const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
+void CLDepthwiseConvolutionLayerNativeKernel::configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ICLTensor *output_multipliers,
+ const ICLTensor *output_shifts)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, output_multipliers, output_shifts);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_info, conv_info,
+ output_multipliers, output_shifts);
}
-void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const DWCWeightsKernelInfo &dwc_weights_info,
- const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
+void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ICLTensor *output_multipliers,
+ const ICLTensor *output_shifts)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
- dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation,
- (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
+ if (output == nullptr)
+ {
+ // In-place
+ output = input;
+ }
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
+ input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), dwc_info,
+ conv_info, (output_multipliers != nullptr) ? output_multipliers->info() : nullptr,
+ (output_shifts != nullptr) ? output_shifts->info() : nullptr));
+
+ auto padding_info = get_padding_info({input, output});
+
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(
+ *(input->info()), *(weights->info()), conv_info);
+ auto_init_if_empty(*(output->info()), input->info()
+ ->clone()
+ ->set_tensor_shape(output_shape)
+ .set_quantization_info(output->info()->quantization_info()));
+
+ _input = input;
+ _output = output;
+ _weights = weights;
+ _biases = biases;
+ _depth_multiplier = conv_info.depth_multiplier;
+ _output_multipliers = output_multipliers;
+ _output_shifts = output_shifts;
+ _export_input_to_cl_image = dwc_info.export_input_to_cl_image;
+ _export_weights_to_cl_image = dwc_info.export_weights_to_cl_image;
+ _is_quantized = is_data_type_quantized(input->info()->data_type());
+
+ const unsigned int n0 = adjust_vec_size(dwc_info.n0, output->info()->dimension(0));
+ const unsigned int m0 = std::min(dwc_info.m0, (unsigned int)output->info()->dimension(1));
+ std::string kernel_name = "";
- auto padding_info = get_padding_info({ input, output });
+ CLBuildOptions build_opts;
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*(input->info()), *(weights->info()), conv_info, depth_multiplier, dilation);
- auto_init_if_empty(*(output->info()), input->info()->clone()->set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info()));
+ // Update the padding for the input/weights tensor if we can export to cl_image
+ if (_export_input_to_cl_image)
+ {
+ arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(input->info());
+ }
- _input = input;
- _output = output;
- _weights = weights;
- _biases = biases;
- _depth_multiplier = depth_multiplier;
- _output_multipliers = output_multipliers;
- _output_shifts = output_shifts;
- _is_quantized = is_data_type_quantized(input->info()->data_type());
+ if (_export_weights_to_cl_image)
+ {
+ arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(weights->info());
+ }
- const unsigned int n0 = adjust_vec_size(dwc_weights_info.n0, input->info()->dimension(0));
+    // The conditions under which -cl-fast-relaxed-math causes accuracy issues are tracked in COMPMID-5324 (see the sketch below)
+ const GPUTarget gpu_target = get_target();
+ const auto act_function = conv_info.act_info.activation();
+ const auto dst_data_type = _output->info()->data_type();
- CLBuildOptions build_opts;
- build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
- build_opts.add_option_if(_input->info()->tensor_shape().total_size_upper(3) > 1, "-DDST_DEPTH=" + support::cpp11::to_string(static_cast<int>(_output->info()->dimension(2))));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
- build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(dwc_info.activation_info.activation())));
- build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(depth_multiplier));
+ if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+ (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU ||
+ act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) &&
+ (dst_data_type == DataType::F32 || dst_data_type == DataType::F16))
+ {
+        // -cl-fast-relaxed-math implies both -cl-finite-math-only and -cl-unsafe-math-optimizations;
+        // to avoid -cl-finite-math-only, we pass only -cl-unsafe-math-optimizations
+ build_opts.add_option("-cl-unsafe-math-optimizations");
+ }
+ else
+ {
+ build_opts.add_option("-cl-fast-relaxed-math");
+ }
+
+ build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_function)));
+ build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(conv_info.depth_multiplier));
+ build_opts.add_option_if_else(_export_input_to_cl_image, "-DSRC_TENSOR_TYPE=IMAGE", "-DSRC_TENSOR_TYPE=BUFFER");
+    // Note: SRC_DATA_TYPE must be the same data type as WEI_DATA_TYPE. In the quantized case the
+    // activations and weights could in principle have different data types. However, since the implementation
+    // only works when both share the same data type, we adjust the offset to account for this.
+ build_opts.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
+ build_opts.add_option("-DDST_TENSOR_TYPE=BUFFER");
+ build_opts.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst_data_type));
+ build_opts.add_option_if_else(_export_weights_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER");
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(_input->info()->dimension(1)));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(_input->info()->dimension(2)));
+ build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(1)));
+ build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(_output->info()->dimension(2)));
+ build_opts.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(_weights->info()->dimension(1)));
+ build_opts.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(_weights->info()->dimension(2)));
+ build_opts.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(_weights->info()->data_type()));
+ build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_stride_info.pad_top()));
+ build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_stride_info.pad_left()));
+ build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.pad_stride_info.stride().first));
+ build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.pad_stride_info.stride().second));
+ build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(conv_info.dilation.x()));
+ build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(conv_info.dilation.y()));
build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
- build_opts.add_option("-DSRC_DIM1=" + support::cpp11::to_string(_input->info()->dimension(1)));
- build_opts.add_option("-DSRC_DIM2=" + support::cpp11::to_string(_input->info()->dimension(2)));
- build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(weights->info()->dimension(1)));
- build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(weights->info()->dimension(2)));
- build_opts.add_option("-DCONV_PAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
- build_opts.add_option("-DCONV_PAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
- build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
- build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
- build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
- build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(_input->info()->dimension(0) % n0));
-
- std::string kernel_name = (_is_quantized) ? "dwc_MxN_native_quantized8_nhwc" : "dwc_MxN_native_fp_nhwc";
-
- if(_is_quantized)
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+ build_opts.add_option("-DM0_A=" + support::cpp11::to_string(_weights->info()->dimension(1) + m0 - 1));
+ build_opts.add_option_if_else(conv_info.depth_multiplier > 1, "-DN0_A=1",
+ "-DN0_A=" + support::cpp11::to_string(n0));
+ build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(_output->info()->dimension(0) % n0));
+ build_opts.add_option_if(_input->info()->num_dimensions() > 3, "-DBATCHED_EXECUTION");
+
+    // Force unrolling via pragma when any of the following values exceeds the maximum number of manual unrolls
+ set_unroll_with_pragma(build_opts, {static_cast<int>(_weights->info()->dimension(1) + m0 - 1),
+ static_cast<int>(_weights->info()->dimension(1)),
+ static_cast<int>(_weights->info()->dimension(2))});
+
+ if (biases != nullptr)
{
- const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform();
- const UniformQuantizationInfo wq_info = _weights->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform();
+ build_opts.add_option(std::string("-DHAS_BIAS"));
+ build_opts.add_option(
+ std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->info()->data_type())));
+ }
- build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iq_info.offset));
- build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wq_info.offset));
- build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oq_info.offset));
- build_opts.add_option_if(is_data_type_quantized_per_channel(weights->info()->data_type()), "-DPER_CHANNEL_QUANTIZATION");
+ if (_is_quantized)
+ {
+ kernel_name = "dwc_native_quantized_nhwc";
+ const UniformQuantizationInfo iqinfo = input->info()->quantization_info().uniform();
+ const UniformQuantizationInfo wqinfo = weights->info()->quantization_info().uniform();
+ const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform();
- // Compute non-per-channel multiplier and shift anyway to make OpenCL kernel simpler
- float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
+ PixelValue zero_value = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
+ int zero_value_s32;
+ zero_value.get(zero_value_s32);
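+        // Descriptive note: zero_value_s32 is the quantized representation of 0.0f (the zero-point);
+        // it is passed to the kernel as ZERO_VALUE, presumably so padded out-of-bound pixels
+        // contribute an exact zero once dequantized.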
+
+ float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
int output_multiplier = 0;
int output_shift = 0;
quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
- build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
- build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
-
- if(dwc_info.activation_info.enabled())
- {
- int a_val{};
- int b_val{};
- std::tie(b_val, a_val) = get_quantized_activation_min_max(dwc_info.activation_info, input->info()->data_type(), oq_info);
-
- const int o1 = oq_info.offset;
-
- build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
- build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
- build_opts.add_option("-DCONST_0=" + support::cpp11::to_string(o1));
-
- const float s1 = iq_info.scale;
- build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
- build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
- }
-
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DWEIGHTS_TYPE=" + get_cl_type_from_data_type(weights->info()->data_type()));
+ build_opts.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+ build_opts.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift));
+ build_opts.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
+ build_opts.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
+ build_opts.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
+ build_opts.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32));
+ build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32));
+ build_opts.add_option("-DDST_MULTIPLIERS_DATA_TYPE=" +
+ get_cl_type_from_data_type(_output_multipliers->info()->data_type()));
+ build_opts.add_option("-DDST_SHIFTS_DATA_TYPE=" +
+ get_cl_type_from_data_type(_output_shifts->info()->data_type()));
+ build_opts.add_option_if_else(weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL,
+ "-DQUANTIZATION_TYPE=PER_CHANNEL", "-DQUANTIZATION_TYPE=PER_TENSOR");
+ // Note: We expect the input and output tensors to always adopt a per-tensor quantization approach
+ int a_val{};
+ int b_val{};
+ std::tie(b_val, a_val) =
+ get_quantized_activation_min_max(conv_info.act_info, input->info()->data_type(), oqinfo);
+
+ build_opts.add_option_if(conv_info.act_info.enabled(), "-DA_VAL=" + support::cpp11::to_string(a_val));
+ build_opts.add_option_if(conv_info.act_info.enabled(), "-DB_VAL=" + support::cpp11::to_string(b_val));
}
else
{
- build_opts.add_option_if(dwc_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(dwc_info.activation_info.a()));
- build_opts.add_option_if(dwc_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(dwc_info.activation_info.b()));
+ kernel_name = "dwc_native_fp_nhwc";
+ build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option_if(conv_info.act_info.enabled(),
+ "-DA_VAL=" + float_to_string_with_full_precision(conv_info.act_info.a()));
+ build_opts.add_option_if(conv_info.act_info.enabled(),
+ "-DB_VAL=" + float_to_string_with_full_precision(conv_info.act_info.b()));
}
- Window win = calculate_max_window(*(output->info()), Steps(n0));
+ Window win = calculate_max_window(*(output->info()), Steps(n0, m0));
ICLKernel::configure_internal(win);
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
@@ -261,11 +395,17 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
_config_id += string_from_data_type(input->info()->data_type());
}
-Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const DWCWeightsKernelInfo &dwc_weights_info, const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const Size2D &dilation, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, output_multipliers, output_shifts));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts));
return Status{};
}
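For reference, the requantization path in configure() above folds the input, weights and output scales into one fixed-point multiplier. The sketch below is illustrative only: the real quantization::calculate_quantized_multiplier also validates ranges and rounding corner cases, and quantize_multiplier_sketch is a hypothetical name.

    #include <cmath>
    #include <cstdint>

    // Decompose 'multiplier' as q * 2^-shift, with q a Q0.31 fixed-point value.
    // E.g. 0.02 * 0.005 / 0.1 = 0.001 = 0.512 * 2^-9, so q ~= 0.512 * 2^31 and shift = 9.
    void quantize_multiplier_sketch(float multiplier, int32_t *quantized, int *right_shift)
    {
        int   exponent = 0;
        float mantissa = std::frexp(multiplier, &exponent); // mantissa in [0.5, 1)
        *right_shift   = -exponent;
        auto  q        = static_cast<int64_t>(std::llround(static_cast<double>(mantissa) * (1ll << 31)));
        if (q == (1ll << 31)) // rounding pushed the mantissa up to 1.0
        {
            q /= 2;
            --(*right_shift);
        }
        *quantized = static_cast<int32_t>(q);
    }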
@@ -276,37 +416,61 @@ void CLDepthwiseConvolutionLayerNativeKernel::run(const Window &window, cl::Comm
// Collapse window
Window window_collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
- Window slice_in = window.first_slice_window_4D();
- Window slice_out = window_collapsed.first_slice_window_4D();
- if(_depth_multiplier != 1)
- {
- ARM_COMPUTE_ERROR_ON(slice_out.x().step() != 1);
- slice_out.set(Window::DimX, Window::Dimension(0, _input->info()->tensor_shape()[0], 1));
- }
+ Window slice = window_collapsed.first_slice_window_4D();
- unsigned int idx = 2 * num_arguments_per_4D_tensor() + num_arguments_per_3D_tensor();
+ cl::Image2D input_cl_image;
+ cl::Image2D weights_cl_image;
- // Set output multipliers in case of quantized data type
- if(_is_quantized)
+ if (_export_input_to_cl_image || _export_weights_to_cl_image)
{
- add_1D_tensor_argument(idx, _output_multipliers, slice_in);
- add_1D_tensor_argument(idx, _output_shifts, slice_in);
+ // Export cl_buffer to cl_image
+ if (_export_input_to_cl_image)
+ {
+ const size_t image_w = _input->info()->dimension(0) / 4;
+ const size_t image_h =
+ _input->info()->dimension(1) * _input->info()->dimension(2) * _input->info()->dimension(3);
+ const TensorShape shape2d(image_w, image_h);
+ const size_t image_row_pitch = _input->info()->strides_in_bytes()[1];
+ input_cl_image =
+ create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d,
+ _input->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+ }
+
+ if (_export_weights_to_cl_image)
+ {
+ const size_t image_w = _weights->info()->dimension(0) / 4;
+ const size_t image_h =
+ _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3);
+ const TensorShape shape2d(image_w, image_h);
+ const size_t image_row_pitch = _weights->info()->strides_in_bytes()[1];
+ weights_cl_image =
+ create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d,
+ _weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+ }
}
- if(_biases != nullptr)
+ unsigned int idx = 0;
+ if (_export_input_to_cl_image)
{
- add_1D_tensor_argument(idx, _biases, slice_in);
+ _kernel.setArg(idx++, input_cl_image);
}
-
- do
+ add_4d_tensor_nhwc_argument(idx, _input);
+ add_4d_tensor_nhwc_argument(idx, _output);
+ if (_export_weights_to_cl_image)
+ {
+ _kernel.setArg(idx++, weights_cl_image);
+ }
+ add_4d_tensor_nhwc_argument(idx, _weights);
+ if (_is_quantized)
+ {
+ add_1D_tensor_argument(idx, _output_multipliers, slice);
+ add_1D_tensor_argument(idx, _output_shifts, slice);
+ }
+ if (_biases != nullptr)
{
- idx = 0;
- add_4D_tensor_argument(idx, _input, slice_in);
- add_4D_tensor_argument(idx, _output, slice_out);
- add_3D_tensor_argument(idx, _weights, slice_out);
- enqueue(queue, *this, slice_out, lws_hint());
+ add_1D_tensor_argument(idx, _biases, slice);
}
- while(window_collapsed.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
+ enqueue(queue, *this, slice, lws_hint());
}
} // namespace arm_compute
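A note on the cl_image export path in run() above: each image texel packs four consecutive channel elements, so the image width is the channel dimension divided by 4 (the export path assumes the channel count is a multiple of 4), the image height collapses the W, H and N dimensions, and the row pitch is taken from the tensor's y-stride so the image aliases the existing cl_buffer without a copy. A worked example under those assumptions, for an NHWC tensor stored as [C=64, W=56, H=56, N=1]:

    const size_t channels = 64, width = 56, height = 56, batches = 1;
    const size_t image_w  = channels / 4;             // 16 texels per image row
    const size_t image_h  = width * height * batches; // 3136 image rows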
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
index 325f4e7067..d34a662966 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,10 @@
#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
-#include "src/core/CL/ICLKernel.h"
-
#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/function_info/ConvolutionInfo.h"
+
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
@@ -47,85 +48,84 @@ public:
CLDepthwiseConvolutionLayerNativeKernel(CLDepthwiseConvolutionLayerNativeKernel &&) = default;
/** Allow instances of this class to be moved */
CLDepthwiseConvolutionLayerNativeKernel &operator=(CLDepthwiseConvolutionLayerNativeKernel &&) = default;
+
/** Initialize the function's source, destination and parameters
*
+ * @param[in] compile_context The compile context to be used.
* @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
* @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, N, M].
* Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
* @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
* Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
+ * @param[out] output Destination tensor. Pass in nullptr or @p input for in-place operation. Data type supported: Same as @p input.
* @param[in] dwc_info Depthwise convolution layer info
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] conv_info Convolution info (padding, stride, dilation, ...)
* @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
* the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
* @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
     *                                the number of shifts must be equal to the number of filters (IFM). Supported data types: S32
+ *
+ * @note: In-place is only supported when
+ * * data layout: NHWC
+ * * filter: 1x1
+ * * @p depth_multiplier: 1
+ * * strides: 1
+ * * dilation: 1
+ * * no padding
+ * * no change of data layout after configure
*/
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
- const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
- const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
- /** Initialize the function's source, destination and parameters
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ICLTensor *output_multipliers = nullptr,
+ const ICLTensor *output_shifts = nullptr);
+
+    /** Initialize the function's source, destination and parameters
*
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, N, M].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
- * @param[in] dwc_info Depthwise convolution layer info
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+ * Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure()
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
- const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
- const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
+ void configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ICLTensor *output_multipliers = nullptr,
+ const ICLTensor *output_shifts = nullptr);
+
/** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel
*
- * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
- * @param[in] weights Weights tensor info. A 3D tensor with dimensions [IFM, N, M].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor info. Data type supported: Same as @p input.
- * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
- * @param[in] dwc_info Depthwise convolution layer info
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+ * Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCWeightsKernelInfo &dwc_weights_info,
- const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
- const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ITensorInfo *output_multipliers = nullptr,
+ const ITensorInfo *output_shifts = nullptr);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
private:
- const ICLTensor *_input;
- const ICLTensor *_weights;
- const ICLTensor *_biases;
- ICLTensor *_output;
- unsigned int _depth_multiplier;
- const ICLTensor *_output_multipliers;
- const ICLTensor *_output_shifts;
- bool _is_quantized;
+ const ICLTensor *_input{};
+ const ICLTensor *_weights{};
+ const ICLTensor *_biases{};
+ ICLTensor *_output{};
+ unsigned int _depth_multiplier{0};
+ const ICLTensor *_output_multipliers{};
+ const ICLTensor *_output_shifts{};
+ bool _export_input_to_cl_image{false};
+    bool             _export_weights_to_cl_image{false};
+ bool _is_quantized{false};
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H */
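Taken together, the new interface replaces the loose (PadStrideInfo, depth_multiplier, dilation) arguments with a single ConvolutionInfo, and DWCWeightsKernelInfo/DWCKernelInfo with DWCComputeKernelInfo. A hedged usage sketch follows (src, wei and dst are hypothetical, already-initialised CL tensors; field names follow the structs referenced above):

    ConvolutionInfo conv_info{};
    conv_info.pad_stride_info  = PadStrideInfo(1, 1, 0, 0); // stride 1, no padding
    conv_info.depth_multiplier = 1;
    conv_info.act_info         = ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU);
    conv_info.dilation         = Size2D(1, 1);

    DWCComputeKernelInfo dwc_info{};
    dwc_info.n0                         = 4; // channels per work-item
    dwc_info.m0                         = 1; // output rows per work-item
    dwc_info.export_input_to_cl_image   = false;
    dwc_info.export_weights_to_cl_image = false;

    CLDepthwiseConvolutionLayerNativeKernel kernel;
    if (bool(CLDepthwiseConvolutionLayerNativeKernel::validate(src.info(), wei.info(), nullptr,
                                                               dst.info(), dwc_info, conv_info)))
    {
        kernel.configure(&src, &wei, nullptr, &dst, dwc_info, conv_info);
    }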
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp
deleted file mode 100644
index b10c23bde9..0000000000
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const DepthwiseConvolutionReshapeInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
- ARM_COMPUTE_RETURN_ERROR_ON(info.c0 != 4);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_h) != 3);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != 3);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
-
- if(output->total_size() != 0)
- {
- auto reshaped_weights_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*input, info);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), reshaped_weights_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const DepthwiseConvolutionReshapeInfo &info)
-{
- auto reshaped_input_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*input, info);
- auto_init_if_empty(*output, reshaped_input_shape, 1, input->data_type(), input->quantization_info());
-
- Window win = calculate_max_window(*input, Steps(info.c0));
- AccessWindowHorizontal weights_access(input, 0, info.c0);
- const bool window_changed = update_window_and_padding(win, weights_access);
-
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLDepthwiseConvolutionLayerReshapeWeightsKernel::CLDepthwiseConvolutionLayerReshapeWeightsKernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLDepthwiseConvolutionLayerReshapeWeightsKernel::configure(const ICLTensor *input, ICLTensor *output, const DepthwiseConvolutionReshapeInfo &info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
-}
-
-void CLDepthwiseConvolutionLayerReshapeWeightsKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const DepthwiseConvolutionReshapeInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), info));
- auto win_config = validate_and_configure_window(input->info(), output->info(), info);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
- ICLKernel::configure_internal(win_config.second);
-
- _input = input;
- _output = output;
-
- // Build the kernel
- CLBuildOptions build_opts;
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(info.c0));
- build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(0)));
- build_opts.add_option_if(info.transpose, "-DTRANSPOSE");
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
-
- _kernel = create_kernel(compile_context, "depthwise_convolution_reshape_weights", build_opts.options());
-}
-
-Status CLDepthwiseConvolutionLayerReshapeWeightsKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const DepthwiseConvolutionReshapeInfo &info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), info).first);
- return Status{};
-}
-
-void CLDepthwiseConvolutionLayerReshapeWeightsKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, window);
- add_2D_tensor_argument(idx, _output, window);
- enqueue(queue, *this, window, lws_hint());
-}
-} // namespace arm_compute
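For the record, the kernel removed above packed an [IFM, W, H] weights tensor so that each work-item could read C0 channels contiguously; per the header documentation of the removed class (below), the destination shape is [W*H*C0, ceil(IFM/C0)]. A quick arithmetic check for the only configuration the kernel accepted (3x3 filters, C0 = 4), with a hypothetical IFM of 32:

    const size_t ifm = 32, w = 3, h = 3, c0 = 4;
    const size_t dst_width  = w * h * c0;          // 36
    const size_t dst_height = (ifm + c0 - 1) / c0; // ceil(32 / 4) = 8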
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h
deleted file mode 100644
index 650fe9a11b..0000000000
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERRESHAPEWEIGHTSKERNEL_H
-#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERRESHAPEWEIGHTSKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to reshape the weights of depthwise convolution. */
-class CLDepthwiseConvolutionLayerReshapeWeightsKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLDepthwiseConvolutionLayerReshapeWeightsKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthwiseConvolutionLayerReshapeWeightsKernel(const CLDepthwiseConvolutionLayerReshapeWeightsKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthwiseConvolutionLayerReshapeWeightsKernel &operator=(const CLDepthwiseConvolutionLayerReshapeWeightsKernel &) = delete;
- /** Default Move Constructor. */
- CLDepthwiseConvolutionLayerReshapeWeightsKernel(CLDepthwiseConvolutionLayerReshapeWeightsKernel &&) = default;
- /** Default move assignment operator */
- CLDepthwiseConvolutionLayerReshapeWeightsKernel &operator=(CLDepthwiseConvolutionLayerReshapeWeightsKernel &&) = default;
-
- /** Initialize the function's source and destination.
- *
- * @param[in] input The input tensor of dimension [IFM, W, H]. Data types supported: All. Data layouts supported: NHWC
- * @param[out] output The output tensor of dimension [W*H*C0, ceil(IFM/C0)]. C0 is the number of channels read by each thread. Data types supported: same as @p input.
- * @param[in] info Depthwise convolution information to reshape the input tensor.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const DepthwiseConvolutionReshapeInfo &info);
- /** Initialize the function's source and destination.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input The input tensor of dimension [IFM, W, H]. Data types supported: All. Data layouts supported: NHWC
- * @param[out] output The output tensor of dimension [W*H*C0, ceil(IFM/C0)]. C0 is the number of channels read by each thread. Data types supported: same as @p input.
- * @param[in] info Depthwise convolution information to reshape the input tensor.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const DepthwiseConvolutionReshapeInfo &info);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3NHWCKernel
- *
- * @param[in] input The input tensor info of dimension [IFM, W, H]. Data types supported: All. Data layouts supported: NHWC
- * @param[in] output The output tensor info of dimension [W*H*C0, ceil(IFM/C0)]. C0 is the number of channels read by each thread. Data types supported: same as @p input.
- * @param[in] info Depthwise convolution information to reshape the input tensor.
- *
- * @return a Status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const DepthwiseConvolutionReshapeInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
-
- void configure_dot_product(const DepthwiseConvolutionReshapeInfo &info);
- void configure_generic(const DepthwiseConvolutionReshapeInfo &info);
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERRESHAPEWEIGHTSKERNEL_H */
diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
deleted file mode 100644
index 3723c651fe..0000000000
--- a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLDequantizationLayerKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16);
-
- if(output->tensor_shape().total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
-
- return Status{};
-}
-} // namespace
-
-CLDequantizationLayerKernel::CLDequantizationLayerKernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLDequantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLDequantizationLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, DataType::F32);
-
- auto padding_info = get_padding_info({ input, output });
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
- _input = input;
- _output = output;
-
- const int vec_size_x = 16 / output->info()->element_size();
- const int output_width_x = output->info()->tensor_shape().x();
- const bool multi_access_x = (output_width_x / vec_size_x > 0);
-
- const bool is_quantized_per_channel = is_data_type_quantized_per_channel(input->info()->data_type());
- std::string kernel_name = "dequantization_layer";
-
- // Create kernel
- CLBuildOptions build_opts;
- if(!is_quantized_per_channel)
- {
- const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
- const int qoffset = is_data_type_quantized_asymmetric(input->info()->data_type()) ? qinfo.offset : 0;
- build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
- build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qoffset));
- }
- else
- {
- kernel_name += "_per_channel";
- kernel_name += input->info()->data_layout() == DataLayout::NCHW ? "_nchw" : "_nhwc";
- }
-
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
- build_opts.add_option("-DDATA_TYPE_SRC=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DDATA_TYPE_DST=" + get_cl_type_from_data_type(output->info()->data_type()));
- build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
-
- // Create kernel name
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info());
- if(multi_access_x)
- {
- win.set(Window::DimX,
- Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
- }
- ICLKernel::configure_internal(win);
-
- // Set output valid region
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLDequantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
- return Status{};
-}
-
-void CLDequantizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const bool is_quantized_per_channel = is_data_type_quantized_per_channel(_input->info()->data_type());
-
- // Collapse window
- Window new_window = is_quantized_per_channel ? window.collapse_if_possible(ICLKernel::window(), 4) : window.collapse_if_possible(ICLKernel::window(), 3);
- Window slice = new_window.first_slice_window_3D();
-
- if(is_quantized_per_channel)
- {
- unsigned int idx = num_arguments_per_3D_tensor() * 2; //Skip the input and output parameters
- _kernel.setArg(idx++, _input->quantization().scale->cl_buffer());
- }
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(new_window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
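The kernel removed above implemented the standard affine dequantization, out = scale * (q - offset), with the per-channel variants reading scale from a per-channel buffer instead of the compile-time -DSCALE define. A minimal scalar reference (dequantize_qasymm8_ref is an illustrative name, not an ACL symbol):

    #include <cstdint>

    // Affine dequantization of a QASYMM8 value: out = scale * (q - offset).
    // E.g. with scale = 0.05f and offset = 128: q = 128 -> 0.0f, q = 168 -> 2.0f.
    float dequantize_qasymm8_ref(uint8_t q, float scale, int32_t offset)
    {
        return scale * (static_cast<float>(q) - static_cast<float>(offset));
    }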
diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.h b/src/core/CL/kernels/CLDequantizationLayerKernel.h
deleted file mode 100644
index 5579b5bc71..0000000000
--- a/src/core/CL/kernels/CLDequantizationLayerKernel.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEQUANTIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_CLDEQUANTIZATIONLAYERKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the dequantization layer kernel. */
-class CLDequantizationLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLDequantizationLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDequantizationLayerKernel(const CLDequantizationLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDequantizationLayerKernel &operator=(const CLDequantizationLayerKernel &) = delete;
- /** Default Move Constructor. */
- CLDequantizationLayerKernel(CLDequantizationLayerKernel &&) = default;
- /** Default move assignment operator */
- CLDequantizationLayerKernel &operator=(CLDequantizationLayerKernel &&) = default;
- /** Default destructor */
- ~CLDequantizationLayerKernel() = default;
- /** Set the input, output, min and max.
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
- * @param[out] output Destination tensor. Data types supported: F16/F32.
- */
- void configure(const ICLTensor *input, ICLTensor *output);
- /** Set the input, output, min and max.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
- * @param[out] output Destination tensor. Data types supported: F16/F32.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLDequantizationLayerKernel
- *
- * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
- * @param[in] output Output tensor info. Data types supported: F16/F32.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDEQUANTIZATIONLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
deleted file mode 100644
index 2fc3c60f67..0000000000
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ /dev/null
@@ -1,585 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-
- const DataLayout data_layout = input->data_layout();
- const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), "Weights should have same width and height");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != input->dimension(channel_idx),
- "Weights feature map dimension should match the respective input's one");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported for 1x1 convolution.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 || weights->dimension(width_idx) == 9)
- && std::get<0>(conv_info.stride()) > 2,
- "Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution.");
-
- if(data_layout == DataLayout::NCHW)
- {
- if(is_data_type_quantized(input->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9,
- "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types");
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5,
- "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types");
- }
- }
-
- if(biases != nullptr)
- {
- if(is_data_type_quantized_asymmetric(input->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3),
- "Biases size and number of input feature maps should match");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1,
- "Biases should be one dimensional");
- }
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
- misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
-
- const auto data_type = input->data_type();
- if(is_data_type_quantized(data_type))
- {
- const UniformQuantizationInfo iqinfo = input->quantization_info().uniform();
- const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
- const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
-
- float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
- int output_multiplier = 0;
- int output_shift = 0;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
- }
- return Status{};
-}
-
-inline bool can_run_optimized_kernel_for_bifrost_nchw(GPUTarget gpu_target, unsigned int conv_stride_x, unsigned int conv_stride_y, unsigned int kernel_size,
- DataType data_type, DataLayout data_layout)
-{
- return gpu_target_is_in(gpu_target,
- GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
- GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
- GPUTarget::G52, GPUTarget::G52LIT)
- && (kernel_size <= 5)
- && (conv_stride_x == 1) && (conv_stride_y == 1)
- && (data_type == DataType::F32)
- && (data_layout == DataLayout::NCHW);
-}
-
-inline void setup_num_elems_nchw(unsigned int &num_elems_read_per_iteration_x, unsigned int &num_elems_read_per_iteration_y,
- unsigned int &num_elems_written_per_iteration_x, unsigned int &num_elems_written_per_iteration_y,
- unsigned int kernel_size, const PadStrideInfo &conv_info, const GPUTarget target, ITensorInfo *input)
-{
- const DataType data_type = input->data_type();
- const DataLayout data_layout = input->data_layout();
- unsigned int conv_stride_x = std::get<0>(conv_info.stride());
- unsigned int conv_stride_y = std::get<1>(conv_info.stride());
-
- const bool run_optimized_bifrost = can_run_optimized_kernel_for_bifrost_nchw(target, conv_stride_x, conv_stride_y, kernel_size, data_type, data_layout);
-
- if(run_optimized_bifrost)
- {
- // Configure kernel window
- switch(kernel_size)
- {
- case 1:
- {
- num_elems_read_per_iteration_x = 4;
- num_elems_read_per_iteration_y = 4;
- num_elems_written_per_iteration_x = 4;
- num_elems_written_per_iteration_y = 4;
- break;
- }
- case 3:
- {
- num_elems_read_per_iteration_x = 6;
- num_elems_read_per_iteration_y = 5;
- num_elems_written_per_iteration_x = 4;
- num_elems_written_per_iteration_y = 3;
- break;
- }
- case 5:
- {
- num_elems_read_per_iteration_x = 8;
- num_elems_read_per_iteration_y = 6;
- num_elems_written_per_iteration_x = 4;
- num_elems_written_per_iteration_y = 2;
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost");
- }
- }
- }
- else
- {
- num_elems_read_per_iteration_y = kernel_size;
- num_elems_written_per_iteration_x = 8;
- num_elems_written_per_iteration_y = 1;
- switch(kernel_size)
- {
- case 1:
- switch(conv_stride_x)
- {
- case 1:
- num_elems_read_per_iteration_x = 8;
- break;
- case 2:
- num_elems_read_per_iteration_x = 16;
- break;
- case 3:
- switch(input->element_size())
- {
- case 1:
- num_elems_read_per_iteration_x = 28;
- break;
- case 2:
- num_elems_read_per_iteration_x = 24;
- break;
- case 4:
- num_elems_read_per_iteration_x = 22;
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid data size");
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid convolution stride X");
- }
- break;
- case 3:
- switch(conv_stride_x)
- {
- case 1:
- num_elems_read_per_iteration_x = 10;
- break;
- case 2:
- num_elems_read_per_iteration_x = 17;
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid convolution stride X");
- }
- break;
- case 5:
- switch(conv_stride_x)
- {
- case 1:
- num_elems_read_per_iteration_x = 12;
- break;
- case 2:
- num_elems_read_per_iteration_x = 20;
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid convolution stride X");
- }
- break;
- case 9:
- switch(conv_stride_x)
- {
- case 1:
- num_elems_read_per_iteration_x = 16;
- break;
- case 2:
- num_elems_read_per_iteration_x = 24;
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid convolution stride X");
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid direct convolution size");
- }
- }
-}
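The constants above follow the usual sliding-window relation: writing n output elements along x requires reading roughly (n - 1) * stride_x + kernel_size input elements, with a few cases apparently rounded up for vector alignment (e.g. 20 rather than 19 for 5x5 with stride 2). A quick check (reads_for is an illustrative helper, not part of the file):

    // Horizontal read extent needed for n written elements, before alignment.
    unsigned int reads_for(unsigned int n_written, unsigned int stride_x, unsigned int kernel_size)
    {
        return (n_written - 1) * stride_x + kernel_size;
    }
    // reads_for(8, 1, 3) == 10 and reads_for(8, 2, 3) == 17, matching the switch above.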
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, const GPUTarget target)
-{
- const DataLayout data_layout = input->data_layout();
-
- // Get output shape
- TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info);
-
-    // Output auto initialization if not yet initialized
- auto_init_if_empty(*output, output_shape,
- 1,
- input->data_type(),
- input->quantization_info());
-
- if(data_layout == DataLayout::NHWC)
- {
- const unsigned int vec_size = std::min(static_cast<unsigned int>(output->tensor_shape()[0]), 4u);
-
- // Create window and update padding
- Window win = calculate_max_window(*output, Steps(vec_size, 1U));
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
- Status err = Status{};
- return std::make_pair(err, win);
- }
- else if(data_layout == DataLayout::NCHW)
- {
- const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int kernel_size = weights->dimension(width_idx);
-
- unsigned int num_elems_read_per_iteration_x = 0;
- unsigned int num_elems_read_per_iteration_y = 0;
- unsigned int num_elems_written_per_iteration_x = 0;
- unsigned int num_elems_written_per_iteration_y = 0;
-
- unsigned int conv_pad_left = conv_info.pad_left();
- unsigned int conv_pad_top = conv_info.pad_top();
- unsigned int conv_stride_x = std::get<0>(conv_info.stride());
- unsigned int conv_stride_y = std::get<1>(conv_info.stride());
-
- setup_num_elems_nchw(num_elems_read_per_iteration_x, num_elems_read_per_iteration_y,
- num_elems_written_per_iteration_x, num_elems_written_per_iteration_y,
- kernel_size, conv_info, target, input);
-
- // Create window and update padding
- bool window_changed = false;
- Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
-
- AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, conv_stride_x, conv_stride_y);
- AccessWindowStatic weights_access(weights, 0, 0, kernel_size, kernel_size);
- AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
- window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not supported");
- }
-}
-} // namespace
-
-CLDirectConvolutionLayerKernel::CLDirectConvolutionLayerKernel()
- : _input(nullptr), _biases(nullptr), _weights(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _border_size(0), _conv_stride_x(0), _conv_stride_y(0), _conv_info()
-{
-}
-
-BorderSize CLDirectConvolutionLayerKernel::border_size() const
-{
- return _border_size;
-}
-
-void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info);
-}
-
-void CLDirectConvolutionLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &conv_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-
- // Perform validation
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
- weights->info(),
- (biases != nullptr) ? biases->info() : nullptr,
- output->info(),
- conv_info));
-
- _conv_stride_x = std::get<0>(conv_info.stride());
- _conv_stride_y = std::get<1>(conv_info.stride());
- _data_layout = input->info()->data_layout();
- _input = input;
- _weights = weights;
- _output = output;
- _biases = biases;
- _conv_info = conv_info;
-
- const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
- const unsigned int kernel_size = weights->info()->dimension(width_idx);
- const DataType data_type = input->info()->data_type();
-
- const GPUTarget gpu_target = get_target();
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, gpu_target);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- std::stringstream kernel_name;
- CLBuildOptions build_options;
-
- if(_data_layout == DataLayout::NHWC)
- {
- _border_size = BorderSize();
-
- kernel_name << "direct_convolution_nhwc";
-
- const unsigned int n0 = win_config.second.x().step();
- const unsigned int m0 = win_config.second.y().step();
- const unsigned int k0 = adjust_vec_size(16u, _input->info()->dimension(channel_idx));
- const unsigned int partial_store_n0 = _output->info()->dimension(channel_idx) % n0;
- const unsigned int partial_store_m0 = (_output->info()->dimension(width_idx) * _output->info()->dimension(height_idx)) % m0;
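- // N0 x M0 is the output tile computed per work-item (taken from the window steps) and K0 the
- // vector length used when accumulating over input channels; the PARTIAL_STORE_* values give the
- // leftover channels and spatial positions when the output extents are not multiples of the tile.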
- const unsigned int pad_left = conv_info.pad_left();
- const unsigned int pad_top = conv_info.pad_top();
-
- if(_biases != nullptr)
- {
- build_options.add_option(std::string("-DHAS_BIAS"));
- build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(_biases->info()->data_type())));
- }
- build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(_input->info()->dimension(width_idx)));
- build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(_input->info()->dimension(height_idx)));
- build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(_input->info()->dimension(channel_idx)));
- build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
- build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(width_idx)));
- build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(_output->info()->dimension(height_idx)));
- build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(_output->info()->dimension(channel_idx)));
- build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(_output->info()->data_type()));
- build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(_weights->info()->dimension(width_idx)));
- build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(_weights->info()->dimension(height_idx)));
- build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(_weights->info()->data_type()));
- build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x));
- build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(_conv_stride_y));
- build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(pad_left));
- build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(pad_top));
- build_options.add_option("-DN0=" + support::cpp11::to_string(n0));
- build_options.add_option("-DM0=" + support::cpp11::to_string(m0));
- build_options.add_option("-DK0=" + support::cpp11::to_string(k0));
- build_options.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
- build_options.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
-
- if(is_data_type_quantized(data_type))
- {
- const UniformQuantizationInfo iqinfo = _input->info()->quantization_info().uniform();
- const UniformQuantizationInfo wqinfo = _weights->info()->quantization_info().uniform();
- const UniformQuantizationInfo oqinfo = _output->info()->quantization_info().uniform();
-
- PixelValue zero_value = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
- int zero_value_s32;
- zero_value.get(zero_value_s32);
-
- float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
- int output_multiplier = 0;
- int output_shift = 0;
- quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
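- // calculate_quantized_multiplier() folds the float rescale factor into an integer
- // multiplier/shift pair (gemmlowp-style: factor ~= multiplier / 2^31 * 2^-shift) so the
- // kernel can requantize with integer arithmetic only; e.g. a factor of 0.25 maps to
- // roughly (multiplier = 2^30, shift = 1). The example values are illustrative.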
- build_options.add_option("-DIS_QUANTIZED");
- build_options.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
- build_options.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift));
- build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
- build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
- build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
- build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32));
- build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32));
- }
- else
- {
- build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(data_type));
- build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0));
- build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0));
- build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0));
- }
- }
- else
- {
- _border_size = BorderSize(_input->info()->padding());
-
- kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;
-
- build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS"));
-
- const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost_nchw(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, _data_layout);
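- // The Bifrost-tuned variants are hand-optimized F32 kernels selected from the GPU target,
- // kernel size and stride; every other configuration uses the generic NCHW kernel below.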
-
- if(run_optimized_for_bifrost)
- {
- build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(channel_idx))));
-
- kernel_name << "_f32_bifrost";
- }
- else
- {
- build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
- build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
- build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(channel_idx))));
- build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x)));
- build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type)));
-
- if(is_data_type_quantized(data_type))
- {
- const UniformQuantizationInfo iqinfo = _input->info()->quantization_info().uniform();
- const UniformQuantizationInfo wqinfo = _weights->info()->quantization_info().uniform();
- const UniformQuantizationInfo oqinfo = _output->info()->quantization_info().uniform();
-
- float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
- int output_multiplier = 0;
- int output_shift = 0;
- quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
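- // Same gemmlowp-style decomposition as the NHWC path: OUTPUT_MULTIPLIER and
- // OUTPUT_SHIFT let the kernel rescale the S32 accumulator without float math.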
- build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
- build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
- build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size));
- build_options.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
- build_options.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
- build_options.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
-
- kernel_name.str("direct_convolution_quantized");
- }
- }
- }
-
- _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name.str();
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(data_type));
- _config_id += "_";
- _config_id += support::cpp11::to_string(kernel_size);
- _config_id += "_";
- _config_id += support::cpp11::to_string(border_size().left);
- _config_id += "_";
- _config_id += support::cpp11::to_string(border_size().top);
- _config_id += "_";
- _config_id += support::cpp11::to_string(border_size().right);
- _config_id += "_";
- _config_id += support::cpp11::to_string(border_size().bottom);
- _config_id += "_";
- _config_id += support::cpp11::to_string(_conv_stride_x);
- _config_id += "_";
- _config_id += support::cpp11::to_string(_conv_stride_y);
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(width_idx));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(height_idx));
- _config_id += "_";
- _config_id += lower_string(string_from_data_layout(_data_layout));
-}
-
-Status CLDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const GPUTarget target)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, target).first);
-
- return Status{};
-}
-
-void CLDirectConvolutionLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- // Get initial windows
- Window slice = window.first_slice_window_3D();
-
- if(_data_layout == DataLayout::NHWC)
- {
- slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1) * _output->info()->dimension(2), 1));
- slice.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(3), 1));
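- // NHWC launch window: width and height are collapsed into DimY, so the kernel walks the
- // W*H spatial positions linearly, while DimZ iterates over the batch dimension.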
-
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- add_3D_tensor_argument(idx, _weights, slice);
- if(_biases != nullptr)
- {
- add_1D_tensor_argument(idx, _biases, slice);
- }
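- // The 4D weights are bound as a 3D tensor, so the byte stride along their 4th dimension
- // (one 3D volume per output feature map) is passed separately as a scalar argument.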
- _kernel.setArg(idx++, static_cast<unsigned int>(_weights->info()->strides_in_bytes()[3]));
- enqueue(queue, *this, slice, lws_hint());
- }
- else
- {
- Window win_in = window;
-
- win_in.adjust(Window::DimX, -_conv_info.pad_left(), true);
- win_in.adjust(Window::DimY, -_conv_info.pad_top(), true);
-
- const int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-
- win_in.set_dimension_step(width_idx, window[width_idx].step() * _conv_stride_x);
- win_in.set_dimension_step(height_idx, window[height_idx].step() * _conv_stride_y);
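- // Scale the input window steps by the strides so that each step of the output window
- // advances the read position by (stride_x, stride_y) elements in the input.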
-
- Window slice_in = win_in.first_slice_window_3D();
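- // Weights and biases are identical for every slice, so they are bound once up front;
- // idx1 starts after the argument slots of the two per-slice 3D tensors (input and output).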
- unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
- add_3D_tensor_argument(idx1, _weights, slice);
-
- if(_biases != nullptr)
- {
- Window slice_biases;
- slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
- add_1D_tensor_argument(idx1, _biases, slice_biases);
- }
-
- _kernel.setArg(idx1++, static_cast<unsigned int>(_weights->info()->strides_in_bytes()[3]));
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in));
- }
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.h b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.h
deleted file mode 100644
index 0257d0c2dd..0000000000
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYERKERNEL_H
-#define ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYERKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the direct convolution kernel.
- */
-class CLDirectConvolutionLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLDirectConvolutionLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDirectConvolutionLayerKernel(const CLDirectConvolutionLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDirectConvolutionLayerKernel &operator=(const CLDirectConvolutionLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLDirectConvolutionLayerKernel(CLDirectConvolutionLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLDirectConvolutionLayerKernel &operator=(CLDirectConvolutionLayerKernel &&) = default;
- /** Default destructor */
- ~CLDirectConvolutionLayerKernel() = default;
- /** Set the input, weights, biases and output tensors.
- *
- * @note: DirectConvolution only works in the following configurations:
- * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
- * 3x3 convolution with stride_x = 1/2, stride_y = 1/2
- * 5x5 convolution with stride_x = 1/2, stride_y = 1/2
- * 9x9 convolution with stride_x = 1/2, stride_y = 1/2
- *
- * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represents a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[in] weights Weights tensor. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * The 3rd dimension must be the same as the input's volume 3rd dimension.
- * Data type supported: Same as @p input.
- * @param[in] biases Biases tensor. Biases are a 1D tensor with dimension [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type, where biases should be of S32 type.
- * @param[out] output Output tensor.
- * The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- */
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
- /** Set the input, weights, biases and output tensors.
- *
- * @note: DirectConvolution only works in the following configurations:
- * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
- * 3x3 convolution with stride_x = 1/2, stride_y = 1/2
- * 5x5 convolution with stride_x = 1/2, stride_y = 1/2
- * 9x9 convolution with stride_x = 1/2, stride_y = 1/2
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represents a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[in] weights Weights tensor. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * The 3rd dimension must be the same as the input's volume 3rd dimension.
- * Data type supported: Same as @p input.
- * @param[in] biases Biases tensor. Biases are a 1D tensor with dimension [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type, where biases should be of S32 type.
- * @param[out] output Output tensor.
- * The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLDirectConvolutionLayerKernel
- *
- * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represents a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[in] weights Weights tensor. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * The 3rd dimension must be the same as the input's volume 3rd dimension.
- * Data type supported: Same as @p input.
- * @param[in] biases Biases tensor. Biases are a 1D tensor with dimension [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type, where biases should be of S32 type.
- * @param[in] output Output tensor.
- * The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] target Target GPU architecture.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, const GPUTarget target);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-public:
- const ICLTensor *_input;
- const ICLTensor *_biases;
- const ICLTensor *_weights;
- ICLTensor *_output;
- DataLayout _data_layout;
- BorderSize _border_size;
- int _conv_stride_x;
- int _conv_stride_y;
- PadStrideInfo _conv_info;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
index 22eee11c8a..3d8f875ef7 100644
--- a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
+++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,9 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -35,17 +38,20 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32);
- ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] != idx->tensor_shape().x());
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 2);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -55,33 +61,42 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input,
+ ITensorInfo *output,
+ ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_UNUSED(idx, config);
auto_init_if_empty(*output, input->clone()->set_num_channels(2));
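// The digit-reversed output is always complex: the two channels hold the real and imaginary parts.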
Window win = calculate_max_window(*output, Steps());
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
return std::make_pair(Status{}, win);
}
} // namespace
-CLFFTDigitReverseKernel::CLFFTDigitReverseKernel()
- : _input(nullptr), _output(nullptr), _idx(nullptr)
+CLFFTDigitReverseKernel::CLFFTDigitReverseKernel() : _input(nullptr), _output(nullptr), _idx(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLFFTDigitReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config)
+void CLFFTDigitReverseKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *idx,
+ const FFTDigitReverseKernelInfo &config)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, idx, config);
}
-void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config)
+void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx);
- auto padding_info = get_padding_info({ input, output, idx });
+ auto padding_info = get_padding_info({input, output, idx});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), config));
_input = input;
@@ -112,10 +127,14 @@ void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context,
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+Status CLFFTDigitReverseKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, config));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
return Status{};
}
@@ -135,7 +154,6 @@ void CLFFTDigitReverseKernel::run(const Window &window, cl::CommandQueue &queue)
add_3D_tensor_argument(idx, _output, slice);
add_1D_tensor_argument(idx, _idx, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.h b/src/core/CL/kernels/CLFFTDigitReverseKernel.h
index e5583a4c22..fdd1bcc3d3 100644
--- a/src/core/CL/kernels/CLFFTDigitReverseKernel.h
+++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.h
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H
#define ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H
-#include "src/core/CL/ICLKernel.h"
-
#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/CL/ICLKernel.h"
+
namespace arm_compute
{
// Forward declarations
@@ -56,7 +56,8 @@ public:
* @param[in] idx Digit reverse index tensor. Data type supported: U32
* @param[in] config Kernel configuration.
*/
- void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
+ void
+ configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -65,7 +66,11 @@ public:
* @param[in] idx Digit reverse index tensor. Data type supported: U32
* @param[in] config Kernel configuration.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *idx,
+ const FFTDigitReverseKernelInfo &config);
/** Static function to check if given info will lead to a valid configuration of @ref CLFFTDigitReverseKernel
*
* @param[in] input Source tensor info. Data types supported: F16/F32.
@@ -75,7 +80,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
index 5db3cb6bf2..3729e6b77d 100644
--- a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
+++ b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,6 +28,8 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -45,11 +47,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(CLFFTRadixStageKernel::supported_radix().count(config.radix) == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] % config.radix);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -58,9 +60,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
{
- if(output != nullptr)
+ if (output != nullptr)
{
auto_init_if_empty(*output, *input);
}
@@ -70,18 +73,14 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
steps.set(config.axis, config.radix);
Window win = calculate_max_window(*input, steps);
- if(output != nullptr)
- {
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
- }
return std::make_pair(Status{}, win);
}
} // namespace
-CLFFTRadixStageKernel::CLFFTRadixStageKernel()
- : _input(nullptr), _output(nullptr), _run_in_place(false)
+CLFFTRadixStageKernel::CLFFTRadixStageKernel() : _input(nullptr), _output(nullptr), _run_in_place(false)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLFFTRadixStageKernel::configure(ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config)
@@ -89,11 +88,15 @@ void CLFFTRadixStageKernel::configure(ICLTensor *input, ICLTensor *output, const
configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
}
-void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config)
+void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const FFTRadixStageKernelInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
- auto padding_info = get_padding_info({ input, output });
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
@@ -112,11 +115,12 @@ void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, I
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Set static arguments if not the first stage
- if(!config.is_first_stage)
+ if (!config.is_first_stage)
{
const unsigned int Ni = config.Nx * config.radix;
const float exp_const = (-2.0 * M_PI) / static_cast<float>(Ni);
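// Twiddle-factor angle step of the DFT at this radix stage: scaling exp_const by an element
// index k gives -2*pi*k/Ni, the argument of the complex rotation e^(-j*2*pi*k/Ni).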
- unsigned int idx = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+ unsigned int idx =
+ (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
_kernel.setArg<cl_uint>(idx++, config.Nx);
_kernel.setArg<cl_uint>(idx++, Ni);
_kernel.setArg<cl_float>(idx, exp_const);
@@ -138,21 +142,22 @@ void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, I
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+Status CLFFTRadixStageKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const FFTRadixStageKernelInfo &config)
{
const bool run_in_place = (output == nullptr) || (output == input);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
- (run_in_place) ? nullptr : output->clone().get(),
- config)
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(), config)
+ .first);
return Status{};
}
std::set<unsigned int> CLFFTRadixStageKernel::supported_radix()
{
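// An FFT length is supported only if it factors completely into these radices;
// e.g. 20 = 5 * 4 runs as a radix-5 stage followed by a radix-4 stage.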
- return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 };
+ return std::set<unsigned int>{2, 3, 4, 5, 7, 8};
}
void CLFFTRadixStageKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -167,12 +172,11 @@ void CLFFTRadixStageKernel::run(const Window &window, cl::CommandQueue &queue)
{
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, slice);
- if(!_run_in_place)
+ if (!_run_in_place)
{
add_3D_tensor_argument(idx, _output, slice);
}
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.h b/src/core/CL/kernels/CLFFTRadixStageKernel.h
index 9bb310db83..de80bfced3 100644
--- a/src/core/CL/kernels/CLFFTRadixStageKernel.h
+++ b/src/core/CL/kernels/CLFFTRadixStageKernel.h
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H
#define ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H
-#include "src/core/CL/ICLKernel.h"
-
#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/CL/ICLKernel.h"
+
#include <set>
namespace arm_compute
@@ -69,7 +69,10 @@ public:
* @param[out] output Destination tensor. Can be nullptr. Data type supported: same as @p input
* @param[in] config FFT descriptor metadata.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const FFTRadixStageKernelInfo &config);
/** Static function to check if given info will lead to a valid configuration of @ref CLFFTRadixStageKernel
*
* @param[in] input Source tensor info. Data types supported: F16/F32.
diff --git a/src/core/CL/kernels/CLFFTScaleKernel.cpp b/src/core/CL/kernels/CLFFTScaleKernel.cpp
index edcf5d5a5d..be6e16b074 100644
--- a/src/core/CL/kernels/CLFFTScaleKernel.cpp
+++ b/src/core/CL/kernels/CLFFTScaleKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,9 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -41,7 +44,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -50,30 +53,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
return Status{};
}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps());
-
- if(output != nullptr)
- {
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output, *input->clone());
-
- // CLFFTScaleKernel doesn't need padding so update_window_and_padding() can be skipped
- Coordinates coord;
- coord.set_num_dimensions(output->num_dimensions());
- output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
- }
-
- return std::make_pair(Status{}, win);
-}
} // namespace
-CLFFTScaleKernel::CLFFTScaleKernel()
- : _input(nullptr), _output(nullptr), _run_in_place(false)
+CLFFTScaleKernel::CLFFTScaleKernel() : _input(nullptr), _output(nullptr), _run_in_place(false)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLFFTScaleKernel::configure(ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config)
@@ -81,11 +65,14 @@ void CLFFTScaleKernel::configure(ICLTensor *input, ICLTensor *output, const FFTS
configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
}
-void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config)
+void CLFFTScaleKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const FFTScaleKernelInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
@@ -94,20 +81,28 @@ void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, ICLTen
// Create kernel
CLBuildOptions build_opts;
build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(output != nullptr ? output->info()->num_channels() : input->info()->num_channels()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(output != nullptr ? output->info()->num_channels()
+ : input->info()->num_channels()));
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option_if(config.conjugate, "-DCONJ");
std::string kernel_name = "fft_scale_conj";
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Set static arguments
- unsigned int idx = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+ unsigned int idx =
+ (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
_kernel.setArg<cl_float>(idx, config.scale);
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), _run_in_place ? nullptr : output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
+ Window win = calculate_max_window(*input->info(), Steps());
+
+ if (output != nullptr)
+ {
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), *input->info()->clone());
+ }
+
+ ICLKernel::configure_internal(win);
// Set config_id for enabling LWS tuning
_config_id = kernel_name;
@@ -124,7 +119,6 @@ Status CLFFTScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *o
{
ARM_COMPUTE_UNUSED(config);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
return Status{};
}
@@ -141,12 +135,11 @@ void CLFFTScaleKernel::run(const Window &window, cl::CommandQueue &queue)
{
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, slice);
- if(!_run_in_place)
+ if (!_run_in_place)
{
add_3D_tensor_argument(idx, _output, slice);
}
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFFTScaleKernel.h b/src/core/CL/kernels/CLFFTScaleKernel.h
index cc518be193..b995282e02 100644
--- a/src/core/CL/kernels/CLFFTScaleKernel.h
+++ b/src/core/CL/kernels/CLFFTScaleKernel.h
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_CLFFTSCALEKERNEL_H
#define ARM_COMPUTE_CLFFTSCALEKERNEL_H
-#include "src/core/CL/ICLKernel.h"
-
#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/CL/ICLKernel.h"
+
namespace arm_compute
{
// Forward declarations
@@ -63,7 +63,10 @@ public:
* @param[out] output Destination tensor. Data type supported: same as @p input
* @param[in] config Kernel configuration
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const FFTScaleKernelInfo &config);
/** Static function to check if given info will lead to a valid configuration of @ref CLFFTScaleKernel
*
* @param[in] input Source tensor info. Data types supported: F16/F32.
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index 840ed0ca2f..86bb502da3 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,16 +29,18 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/WindowHelpers.h"
#include "support/Cast.h"
#include "support/StringSupport.h"
namespace arm_compute
{
-CLFillBorderKernel::CLFillBorderKernel()
- : ICLKernel(), _tensor(nullptr)
+CLFillBorderKernel::CLFillBorderKernel() : ICLKernel(), _tensor(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
bool CLFillBorderKernel::is_parallelisable() const
@@ -54,27 +56,38 @@ void CLFillBorderKernel::set_constant_border(unsigned int idx, const PixelValue
ICLKernel::add_argument<T>(idx, static_cast<T>(value));
}
-void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void CLFillBorderKernel::configure(ICLTensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
configure(CLKernelLibrary::get().get_compile_context(), tensor, border_size, border_mode, constant_border_value);
}
-void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void CLFillBorderKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
_tensor = tensor;
configure(compile_context, tensor->info(), border_size, border_mode, constant_border_value);
}
-void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void CLFillBorderKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
ARM_COMPUTE_ERROR_ON(tensor == nullptr);
ARM_COMPUTE_ERROR_ON(tensor->num_channels() != 1);
- auto padding_info = get_padding_info({ tensor });
+ auto padding_info = get_padding_info({tensor});
border_size.limit(tensor->padding());
// If there is no border: early exit
- if(border_size.empty() || border_mode == BorderMode::UNDEFINED)
+ if (border_size.empty() || border_mode == BorderMode::UNDEFINED)
{
return;
}
@@ -96,25 +109,22 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITen
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Create static kernel arguments
- const unsigned int valid_width = tensor->valid_region().shape[0];
- const unsigned int valid_height = tensor->valid_region().shape[1];
- const cl_int2 valid_region_coords =
- {
- {
- static_cast<cl_int>(tensor->valid_region().anchor[0]),
- static_cast<cl_int>(tensor->valid_region().anchor[1]),
- }
- };
- const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
+ const unsigned int valid_width = tensor->valid_region().shape[0];
+ const unsigned int valid_height = tensor->valid_region().shape[1];
+ const cl_int2 valid_region_coords = {{
+ static_cast<cl_int>(tensor->valid_region().anchor[0]),
+ static_cast<cl_int>(tensor->valid_region().anchor[1]),
+ }};
+ const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
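+ // The border is filled around the tensor's valid region rather than the full allocation, so
+ // the region's anchor and extents are passed to the kernel as static arguments below.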
// Set static kernel arguments
unsigned int idx = num_arguments_per_3D_tensor(); // Skip the tensor parameters
ICLKernel::add_argument<cl_uint>(idx, valid_width);
ICLKernel::add_argument<cl_uint>(idx, valid_height);
ICLKernel::add_argument<cl_int2>(idx, valid_region_coords);
- if(BorderMode::CONSTANT == border_mode)
+ if (BorderMode::CONSTANT == border_mode)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -173,12 +183,13 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITen
void CLFillBorderKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
{
// Border mode undefined or border width == 0
- if(_kernel() == nullptr)
+ if (_kernel() == nullptr)
{
return;
}
- const auto tensor = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ const auto tensor =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
@@ -191,14 +202,13 @@ void CLFillBorderKernel::run_op(ITensorPack &tensors, const Window &window, cl::
unsigned int idx = 0;
add_3D_tensor_argument(idx, tensor, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue)
{
// Border mode undefined or border width == 0
- if(_kernel() == nullptr)
+ if (_kernel() == nullptr)
{
return;
}
@@ -214,7 +224,6 @@ void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue)
unsigned int idx = 0;
add_3D_tensor_argument(idx, _tensor, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFillBorderKernel.h b/src/core/CL/kernels/CLFillBorderKernel.h
index 7951f48171..5782143cf9 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.h
+++ b/src/core/CL/kernels/CLFillBorderKernel.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -57,7 +58,11 @@ public:
* @param[in] border_mode Border mode to use for the convolution.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
/** Initialise the kernel's input, output and border mode.
*
* @param[in,out] tensor Tensor to process Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32.
@@ -65,7 +70,10 @@ public:
* @param[in] border_mode Border mode to use for the convolution.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*/
- void configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+ void configure(ICLTensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
/** Initialise the kernel's input, output and border mode.
*
* @param[in] compile_context The compile context to be used.
@@ -74,7 +82,11 @@ public:
* @param[in] border_mode Border mode to use for the convolution.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
/** Function to set the constant value on fill border kernel depending on type.
*
diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
index 2116239080..7da0679ae4 100644
--- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,20 +29,27 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+Status validate_arguments(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias,
+ const ITensorInfo *bn_beta,
+ const ITensorInfo *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
ARM_COMPUTE_UNUSED(epsilon);
ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var);
@@ -53,43 +60,44 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b
ARM_COMPUTE_RETURN_ERROR_ON(input_bias == nullptr && fused_bias == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON(bn_mean->num_dimensions() > 1);
- if(fbn_type == FuseBatchNormalizationType::CONVOLUTION)
+ if (fbn_type == FuseBatchNormalizationType::CONVOLUTION)
{
ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(3) != bn_mean->dimension(0));
}
else
{
- const size_t channel_idx = get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t channel_idx =
+ get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(channel_idx) != bn_mean->dimension(0));
}
// Validate bias
- if(input_bias != nullptr)
+ if (input_bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, input_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, input_bias);
}
// Validate beta
- if(bn_beta != nullptr)
+ if (bn_beta != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_beta);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_beta);
}
// Validate gamma
- if(bn_gamma != nullptr)
+ if (bn_gamma != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_gamma);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_gamma);
}
// Validate output weights
- if(fused_weights != nullptr && fused_weights->total_size() != 0)
+ if (fused_weights != nullptr && fused_weights->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_weights, fused_weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input_weights, fused_weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_weights);
}
// Validate output bias
- if(fused_bias != nullptr && fused_bias->total_size() != 0)
+ if (fused_bias != nullptr && fused_bias->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, fused_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_bias);
@@ -100,27 +108,52 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b
} // namespace
CLFuseBatchNormalizationKernel::CLFuseBatchNormalizationKernel()
- : _input_weights(nullptr), _input_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(),
- _run_in_place_weights(false), _run_in_place_bias(false)
+ : _input_weights(nullptr),
+ _input_bias(nullptr),
+ _bn_mean(nullptr),
+ _bn_var(nullptr),
+ _bn_gamma(nullptr),
+ _bn_beta(nullptr),
+ _fused_weights(nullptr),
+ _fused_bias(nullptr),
+ _epsilon(),
+ _run_in_place_weights(false),
+ _run_in_place_bias(false)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLFuseBatchNormalizationKernel::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
- ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalizationKernel::configure(const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias,
+ const ICLTensor *bn_beta,
+ const ICLTensor *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+ configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+ input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
}
-void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
- ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias,
+ const ICLTensor *bn_beta,
+ const ICLTensor *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var);
- auto padding_info = get_padding_info({ input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma });
+ auto padding_info =
+ get_padding_info({input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma});
_input_weights = input_weights;
_input_bias = input_bias;
@@ -133,28 +166,28 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c
_epsilon = epsilon;
_run_in_place_weights = (fused_weights == nullptr) || (fused_weights == input_weights);
- _run_in_place_bias = (input_bias != nullptr && fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias);
+ _run_in_place_bias =
+ (input_bias != nullptr && fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias);
// Auto initialize outputs
- if(_fused_weights != nullptr)
+ if (_fused_weights != nullptr)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*_fused_weights->info(), *_input_weights->info()->clone());
}
- if(_fused_bias != nullptr)
+ if (_fused_bias != nullptr)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*_fused_bias->info(), *_bn_mean->info()->clone());
}
// Validate arguments
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_weights->info(), bn_mean->info(), bn_var->info(),
- (fused_weights != nullptr) ? fused_weights->info() : nullptr,
- (fused_bias != nullptr) ? fused_bias->info() : nullptr,
- (input_bias != nullptr) ? input_bias->info() : nullptr,
- (bn_beta != nullptr) ? bn_beta->info() : nullptr,
- (bn_gamma != nullptr) ? bn_gamma->info() : nullptr,
- epsilon, fbn_type));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
+ input_weights->info(), bn_mean->info(), bn_var->info(),
+ (fused_weights != nullptr) ? fused_weights->info() : nullptr,
+ (fused_bias != nullptr) ? fused_bias->info() : nullptr, (input_bias != nullptr) ? input_bias->info() : nullptr,
+ (bn_beta != nullptr) ? bn_beta->info() : nullptr, (bn_gamma != nullptr) ? bn_gamma->info() : nullptr, epsilon,
+ fbn_type));
// Configure kernel window
Window win = calculate_max_window(*input_weights->info());
@@ -163,7 +196,8 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c
// Set build options
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input_weights->info()->data_type()));
- build_opts.add_option_if(fbn_type == FuseBatchNormalizationType::CONVOLUTION, "-DDIM2=" + support::cpp11::to_string(input_weights->info()->dimension(2)));
+ build_opts.add_option_if(fbn_type == FuseBatchNormalizationType::CONVOLUTION,
+ "-DDIM2=" + support::cpp11::to_string(input_weights->info()->dimension(2)));
build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon));
build_opts.add_option_if(_input_weights->info()->data_layout() == DataLayout::NHWC, "-DNHWC");
build_opts.add_option_if(_run_in_place_weights, "-DIN_PLACE_W");
@@ -178,12 +212,19 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias,
+ const ITensorInfo *bn_beta,
+ const ITensorInfo *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+ input_bias, bn_beta, bn_gamma, epsilon, fbn_type));
return Status{};
}
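
For reference, the fold this kernel computes is the standard fusion of batch normalization into the preceding convolution or depthwise convolution weights and bias. A minimal scalar sketch of that arithmetic, assuming per-channel mean/var/beta/gamma vectors and the epsilon validated above (illustrative C++, not the ACL API; an omitted bn_beta or bn_gamma behaves as 0 and 1 respectively):

    // w_fused[c] = w[c] * gamma[c] / sqrt(var[c] + epsilon)
    // b_fused[c] = (b[c] - mean[c]) * gamma[c] / sqrt(var[c] + epsilon) + beta[c]
    #include <cmath>
    #include <cstddef>
    #include <vector>

    void fuse_bn_sketch(std::vector<float> &w, std::vector<float> &b,
                        const std::vector<float> &mean, const std::vector<float> &var,
                        const std::vector<float> &gamma, const std::vector<float> &beta,
                        float epsilon, std::size_t weights_per_channel)
    {
        for (std::size_t c = 0; c < b.size(); ++c)
        {
            const float scale = gamma[c] / std::sqrt(var[c] + epsilon);
            for (std::size_t i = 0; i < weights_per_channel; ++i)
            {
                w[c * weights_per_channel + i] *= scale; // fold the BN scale into the weights
            }
            b[c] = (b[c] - mean[c]) * scale + beta[c]; // fold the BN shift into the bias
        }
    }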
@@ -200,25 +241,25 @@ void CLFuseBatchNormalizationKernel::run(const arm_compute::Window &window, cl::
// Add kernel arguments
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input_weights, slice_3d);
- if(_input_bias != nullptr)
+ if (_input_bias != nullptr)
{
add_1D_tensor_argument(idx, _input_bias, slice_1d);
}
add_1D_tensor_argument(idx, _bn_mean, slice_1d);
add_1D_tensor_argument(idx, _bn_var, slice_1d);
- if(!_run_in_place_weights)
+ if (!_run_in_place_weights)
{
add_3D_tensor_argument(idx, _fused_weights, slice_3d);
}
- if(!_run_in_place_bias)
+ if (!_run_in_place_bias)
{
add_1D_tensor_argument(idx, _fused_bias, slice_1d);
}
- if(_bn_beta != nullptr)
+ if (_bn_beta != nullptr)
{
add_1D_tensor_argument(idx, _bn_beta, slice_1d);
}
- if(_bn_gamma != nullptr)
+ if (_bn_gamma != nullptr)
{
add_1D_tensor_argument(idx, _bn_gamma, slice_1d);
}
diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h
index 78b1e74cab..76ec7a759f 100644
--- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h
+++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h
@@ -62,9 +62,16 @@ public:
* @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
* @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
*/
- void configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ void configure(const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias = nullptr,
+ const ICLTensor *bn_beta = nullptr,
+ const ICLTensor *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
/** Set the source, destination of the kernel
*
* @param[in] compile_context The compile context to be used.
@@ -81,9 +88,17 @@ public:
* @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
* @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias = nullptr,
+ const ICLTensor *bn_beta = nullptr,
+ const ICLTensor *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
/** Static function to check if given info will lead to a valid configuration of @ref CLFuseBatchNormalizationKernel
*
* @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
@@ -101,10 +116,16 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ static Status validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias = nullptr,
+ const ITensorInfo *bn_beta = nullptr,
+ const ITensorInfo *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
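
As with the other CL kernels, validate() mirrors configure() and works on tensor infos before any allocation. A hedged usage sketch relying on the defaulted trailing parameters above (conv_weights, bn_mean, bn_var, fused_weights, fused_bias and conv_bias are illustrative CLTensor objects; checking Status::error_code() against ErrorCode::OK is assumed as the success test):

    CLFuseBatchNormalizationKernel fuse_bn;
    const Status s = CLFuseBatchNormalizationKernel::validate(
        conv_weights.info(), bn_mean.info(), bn_var.info(),
        fused_weights.info(), fused_bias.info(),
        conv_bias.info()); // bn_beta, bn_gamma, epsilon and fbn_type keep their defaults
    if (s.error_code() == ErrorCode::OK)
    {
        fuse_bn.configure(&conv_weights, &bn_mean, &bn_var, &fused_weights, &fused_bias, &conv_bias);
    }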
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
deleted file mode 100644
index 9215fd602d..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
+++ /dev/null
@@ -1,334 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-using namespace misc::shape_calculator;
-
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- if(input0->data_type() == DataType::QASYMM8)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
-
- const int m = gemm_info.m();
- const int n = gemm_info.n();
- const int k = gemm_info.k();
-
- ARM_COMPUTE_UNUSED(m);
- ARM_COMPUTE_UNUSED(n);
- ARM_COMPUTE_UNUSED(k);
-
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k));
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != static_cast<unsigned int>(n));
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(1) != static_cast<unsigned int>(k));
- if(gemm_info.reinterpret_input_as_3d())
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != static_cast<unsigned int>(m));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m));
- }
-
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- }
-
- return Status{};
-}
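
The k0/n0 guards above lean on the classic bit trick: (x & (x - 1)) == 0 holds exactly when x is a power of two, so a value passes only if it is a power of two or exactly 3, with the separate range checks bounding it. Restated on its own (the bit test is also true for 1, which this sketch excludes for clarity):

    // Accepts exactly the block sizes named in the error messages: 2,3,4,8,16.
    bool is_supported_block_size(unsigned int x)
    {
        const bool pot = (x & (x - 1)) == 0; // power-of-two test
        return (pot || x == 3) && x >= 2 && x <= 16;
    }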
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
-
- Window win{};
- bool window_changed = false;
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_output_as_3d to be false.
- if(reinterpret_input_as_3d == reinterpret_output_as_3d)
- {
- reinterpret_output_as_3d = false;
- }
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)).set_data_type(DataType::S32));
-
- TensorInfo tmp_info(*output);
-
- if(reinterpret_output_as_3d)
- {
- // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(output->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- // Configure kernel window
- num_elems_processed_per_iteration_x = rhs_info.n0;
- num_elems_processed_per_iteration_y = lhs_info.m0;
-
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- // RHS matrix still needs padding on the X
- AccessWindowStatic input1_access(input1, 0, 0,
- ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
- input1->dimension(1));
-
- window_changed = update_window_and_padding(win, input1_access); // window used by the execute_window_loop
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
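
The AccessWindowStatic above widens the RHS extent on X with ceil_to_multiple so that the last work-item in a row can still load a full n0-wide block. Assuming the usual round-up semantics for that helper:

    // Assumed semantics of ceil_to_multiple as used above.
    unsigned int ceil_to_multiple_sketch(unsigned int value, unsigned int multiple)
    {
        return ((value + multiple - 1) / multiple) * multiple;
    }
    // e.g. an RHS width of 10 with n0 == 4 is padded out to 12.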
-
-CLGEMMLowpMatrixMultiplyNativeKernel::CLGEMMLowpMatrixMultiplyNativeKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false)
-{
-}
-
-void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, lhs_info, rhs_info, gemm_info);
-}
-
-void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info));
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
- _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
-
- // We still need padding on the X dimension for the RHS matrix
- auto padding_info = get_padding_info({ input0, output });
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
- {
- _reinterpret_input_as_3d = false;
- _reinterpret_output_as_3d = false;
- }
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
- _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
- // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
- // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m
- const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : output->info()->dimension(1);
-    // Calculate the partial-store M0 and N0 for any leftover blocks at the end of a row/column. Storing (rather than loading) partial blocks avoids output padding.
- const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
- const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0;
-
- // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
- // NOTE: This might have implications on heuristics and performance
- const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
- build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
- build_opts.add_option("-DM=" + support::cpp11::to_string(input0->info()->dimension(1)));
- build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
- build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k()));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
- build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type()));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
- build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
- std::string kernel_name("gemmlowp_mm_native");
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
- _config_id += "_";
- _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(gemm_info.k());
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.m0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.n0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.k0);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
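
The PARTIAL_STORE_M0/N0 build options carry the sizes of the leftover blocks so the kernel can store a short block at the end of a row/column instead of demanding a padded output. Worked numbers under assumed shapes (values are illustrative only):

    void partial_store_example()
    {
        const unsigned int internal_m = 33, m0 = 8; // M and row-block size
        const unsigned int n = 20, n0 = 16;         // N and column-block size
        const unsigned int partial_store_m0 = internal_m % m0; // 1: the last row-block stores 1 row
        const unsigned int partial_store_n0 = n % n0;          // 4: the last column-block stores 4 columns
        (void)partial_store_m0;
        (void)partial_store_n0;
    }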
-
-Status CLGEMMLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
-{
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, lhs_info, rhs_info, gemm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
- input1->clone().get(),
- output->clone().get(),
- lhs_info,
- rhs_info,
- gemm_info,
- num_elements_processed)
- .first);
-
- return Status{};
-}
-
-void CLGEMMLowpMatrixMultiplyNativeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- if(_input1->info()->num_dimensions() < 3)
- {
-        // The stride over dimension 3 (the batch stride) of matrix B must be zero if we do not slice
- ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
- }
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- if(_reinterpret_input_as_3d)
- {
- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
- const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- if(_reinterpret_output_as_3d)
- {
- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
- const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- do
- {
- Window slice_b = slice;
-        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A has more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input0, slice);
- add_2D_tensor_argument(idx, _input1, slice_b);
- add_2D_tensor_argument(idx, _output, slice);
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
- enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
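
The slice_matrix_b handling above is a broadcast: when the RHS is a plain 2D matrix but the LHS is batched (as happens when a convolution is lowered to GEMM), the same B slice is reused for every batch instead of sliding with the window. A scalar sketch of the effect (float is used for brevity; the kernel itself consumes quantized inputs and produces S32 accumulators):

    #include <cstddef>

    void batched_gemm_sketch(const float *a, const float *b, float *c,
                             std::size_t m, std::size_t n, std::size_t k,
                             std::size_t batches, bool slide_b)
    {
        for (std::size_t bt = 0; bt < batches; ++bt)
        {
            const float *a_s = a + bt * m * k;
            const float *b_s = slide_b ? b + bt * k * n : b; // pinned to batch 0 when !slide_b
            float       *c_s = c + bt * m * n;
            for (std::size_t i = 0; i < m; ++i)
            {
                for (std::size_t j = 0; j < n; ++j)
                {
                    float acc = 0.0f;
                    for (std::size_t kk = 0; kk < k; ++kk)
                    {
                        acc += a_s[i * k + kk] * b_s[kk * n + j];
                    }
                    c_s[i * n + j] = acc;
                }
            }
        }
    }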
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h
deleted file mode 100644
index 125f0c6948..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices with QASYMM8/QASYMM8_SIGNED data type */
-class CLGEMMLowpMatrixMultiplyNativeKernel : public ICLKernel
-{
-public:
- /** Default Constructor */
- CLGEMMLowpMatrixMultiplyNativeKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyNativeKernel(const CLGEMMLowpMatrixMultiplyNativeKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyNativeKernel &operator=(const CLGEMMLowpMatrixMultiplyNativeKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyNativeKernel(CLGEMMLowpMatrixMultiplyNativeKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyNativeKernel &operator=(CLGEMMLowpMatrixMultiplyNativeKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p input0
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
- * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * @param[in] rhs_info RHS matrix information used to retrieve the number of columns to be processed by each thread
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- */
- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p input0
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
- * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * @param[in] rhs_info RHS matrix information used to retrieve the number of columns to be processed by each thread
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyNativeKernel
- *
- * @param[in] input0 Input tensor info for the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] input1 Input tensor info for the RHS matrix. Data type supported: same as @p input0
- * @param[in] output Output tensor info. Data type supported: S32
- * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * @param[in] rhs_info RHS matrix information used to retrieve the number of columns to be processed by each thread
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input0;
- const ICLTensor *_input1;
- ICLTensor *_output;
- bool _slide_matrix_b;
- bool _reinterpret_input_as_3d;
- bool _reinterpret_output_as_3d;
- bool _use_dummy_work_items;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H*/
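
For concreteness, a lhs_info/rhs_info combination that satisfies the checks in the deleted .cpp above (k0 equal on both sides and in {2,3,4,8,16}, n0 in {2,3,4,8,16}, m0 within range). The values are illustrative; good blocking is device-dependent:

    GEMMLHSMatrixInfo lhs_info;
    lhs_info.m0 = 4;  // rows processed per work-item
    lhs_info.k0 = 16; // accumulation depth per step
    GEMMRHSMatrixInfo rhs_info;
    rhs_info.n0 = 4;  // columns processed per work-item
    rhs_info.k0 = 16; // must equal lhs_info.k0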
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
deleted file mode 100644
index 848f272e50..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
+++ /dev/null
@@ -1,297 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-using namespace misc::shape_calculator;
-
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose);
- ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
-
- const int m = gemm_info.m();
- const int n = gemm_info.n();
- const int k = gemm_info.k();
-
- TensorShape tensor_shape0{ input0->tensor_shape() };
- tensor_shape0.set(0, k);
- tensor_shape0.set(1, m);
-
- TensorShape tensor_shape1{ input1->tensor_shape() };
- tensor_shape1.set(0, n);
- tensor_shape1.set(1, k);
-
- const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
- const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
-
- const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
- const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
-
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)).set_data_type(DataType::S32));
-
- TensorInfo tmp_info(*output);
- if(reinterpret_output_as_3d)
- {
- // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(output->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- // Configure kernel window
- num_elems_processed_per_iteration_x = rhs_info.n0;
- num_elems_processed_per_iteration_y = lhs_info.m0;
- Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- return std::make_pair(Status{}, collapsed);
-}
-} // namespace
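
Both helper functions rely on auto_init_if_empty to let the caller pass an uninitialised output: only a tensor info with total_size() == 0 picks up the computed S32 shape, while a pre-initialised one is left untouched and later cross-checked by validate_arguments. Assumed behaviour of the two-argument overload used above, in isolation:

    TensorInfo out; // empty: total_size() == 0
    auto_init_if_empty(out, TensorInfo(TensorShape(8U, 4U), 1, DataType::S32));
    // out now describes an 8x4 S32 tensor; had out been initialised already,
    // the call would have left it unchanged.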
-
-CLGEMMLowpMatrixMultiplyReshapedKernel::CLGEMMLowpMatrixMultiplyReshapedKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1), _use_dummy_work_items(false)
-{
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, lhs_info, rhs_info, gemm_info);
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info));
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
- _k = gemm_info.k();
- _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
- _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
- auto padding_info = get_padding_info({ input0, input1, output });
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
-    // Calculate the partial-store M0 and N0 for any leftover blocks at the end of a row/column. Storing (rather than loading) partial blocks avoids output padding.
- const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : output->info()->dimension(1);
-
- const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
- const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0;
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
- build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
- build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
- build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
- build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m()));
- build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
- build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
- build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
- build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type()));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
- build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-
- std::string kernel_name("gemmlowp_mm_reshaped_");
- kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
- kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt";
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
- _config_id += "_";
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(gemm_info.k());
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.m0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.n0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.k0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.v0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.h0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.interleave);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.interleave);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
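
Since validate_arguments() rejects a transposed LHS and a non-transposed RHS, only one branch of the name concatenation above is reachable, and _config_id then appends the output shape and blocking parameters so the local-work-size tuner can cache a result per configuration:

    #include <string>
    // The only name producible under the validate() constraints:
    const std::string kernel_name = "gemmlowp_mm_reshaped_lhs_nt_rhs_t";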
-
-Status CLGEMMLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
-{
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, lhs_info, rhs_info, gemm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
- input1->clone().get(),
- output->clone().get(),
- lhs_info,
- rhs_info,
- gemm_info,
- num_elements_processed)
- .first);
-
- return Status{};
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- if(_input1->info()->num_dimensions() < 3)
- {
-        // The stride over dimension 3 (the batch stride) of matrix B must be zero if we do not slice
- ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
- }
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- if(_reinterpret_output_as_3d)
- {
- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 4;
- const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- do
- {
- Window slice_b = slice;
-        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A has more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input0, slice);
- add_2D_tensor_argument(idx, _input1, slice_b);
- add_2D_tensor_argument(idx, _output, slice);
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
- enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
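
Compared with the native kernel, run() here passes one extra scalar (the K dimension) ahead of the three z-strides, which is why the cross-plane-pad argument lands at index 3 * num_arguments_per_2D_tensor() + 4 instead of + 3. The index arithmetic as a sketch (args_per_2d_tensor stands in for whatever num_arguments_per_2D_tensor() reports):

    // Three 2D tensors, then the trailing scalars, then the pad argument.
    unsigned int cross_plane_pad_index(unsigned int args_per_2d_tensor, bool has_k_arg)
    {
        const unsigned int num_scalars = has_k_arg ? 4 : 3; // [k,] stride_z x 3
        return 3 * args_per_2d_tensor + num_scalars;
    }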
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h
deleted file mode 100644
index 100100b1b1..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices when both the input matrices LHS (input0) and RHS (input1) have been reshaped
- *
- * @note The input matrices @p input0 and @p input1 must be reshaped through @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel
- */
-class CLGEMMLowpMatrixMultiplyReshapedKernel : public ICLKernel
-{
-public:
- /** Default Constructor */
- CLGEMMLowpMatrixMultiplyReshapedKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyReshapedKernel(const CLGEMMLowpMatrixMultiplyReshapedKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyReshapedKernel &operator=(const CLGEMMLowpMatrixMultiplyReshapedKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyReshapedKernel(CLGEMMLowpMatrixMultiplyReshapedKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyReshapedKernel &operator=(CLGEMMLowpMatrixMultiplyReshapedKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in]  input0    Input tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less than or equal to 4.
- * @param[in]  input1    Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
- * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * lhs_info.transpose: false
- * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * rhs_info.transpose: true
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @note lhs_info.k0 must be equal to rhs_info.k0
- */
- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in]  input0          Input tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less than or equal to 4.
- * @param[in]  input1          Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
- * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * lhs_info.transpose: false
- * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * rhs_info.transpose: true
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @note lhs_info.k0 must be equal to rhs_info.k0
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyReshapedKernel
- *
- * @param[in]  input0    Input tensor info containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less than or equal to 4.
- * @param[in]  input1    Input tensor info containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
- * @param[in] output Output tensor info. Data type supported: S32
- * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * lhs_info.transpose: false
- * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: 2,3,4,8,16
- * rhs_info.transpose: true
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @note lhs_info.k0 must be equal to rhs_info.k0
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input0;
- const ICLTensor *_input1;
- ICLTensor *_output;
- bool _slide_matrix_b;
- bool _reinterpret_output_as_3d;
- unsigned int _k;
- bool _use_dummy_work_items;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H*/
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
deleted file mode 100644
index d39900a561..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
+++ /dev/null
@@ -1,573 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info,
- const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- if(input0->data_type() == DataType::QASYMM8)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
-
- const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info;
- const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info;
- const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), "Only 2,3,4,8,16 are supported for k0");
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
-
- const int m = gemm_info.m;
- const int n = gemm_info.n;
- const int k = gemm_info.k;
-
- TensorShape tensor_shape1{ input1->tensor_shape() };
- tensor_shape1.set(0, n);
- tensor_shape1.set(1, k);
-
- const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
- const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
-
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k));
- if(gemm_info.reinterpret_input_as_3d)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != static_cast<unsigned int>(m));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m));
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
-
- const TensorShape expected_output_shape = compute_mm_shape(*input0, *input1, gemm_info);
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(expected_output_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- if(output_stage.type == GEMMLowpOutputStageType::NONE)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
- }
- }
-
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != bias->dimension(0));
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT),
- "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported");
-
- // Checks performed if the output stage needs to be fused
- if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- // If a_offset == 0, vector_sum_col can be a nullptr
- if(gemm_info.a_offset != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_output_shape[0]);
- }
-
- // If b_offset == 0, vector_sum_row can be a nullptr
- if(gemm_info.b_offset != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
-
- // Check if mm result is a 3D reinterpretation
- const bool reinterpret_as_3d = expected_output_shape.num_dimensions() > 1 && expected_output_shape.y() != vector_sum_row->tensor_shape().x();
-
- // Validate input
- ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_output_shape[1] * expected_output_shape[2]));
- ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_output_shape[1]);
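- // Illustrative example: for a GEMM with m = 12 whose result is reinterpreted as 3D with
- // expected_output_shape = [n, 3, 4, batches], the shape's y() == 3 differs from
- // vector_sum_row's x() == 12, so reinterpret_as_3d is detected and the row sums must
- // cover 3 * 4 = 12 rows.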
-
- if(expected_output_shape.num_dimensions() > 1)
- {
- const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
-
- TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
- vector_sum_row_shape.collapse_from(1);
- TensorShape collapsed_output_shape(expected_output_shape);
- collapsed_output_shape.collapse_from(output_batch_idx);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_output_shape[output_batch_idx],
- "vector_sum_row must have the same number of batches as the output tensor");
-
- if(gemm_info.a_offset != 0)
- {
- TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
- vector_sum_col_shape.collapse_from(1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
- "vector_sum_col must have the same number of batches as vector_sum_row, or its number of batches must be 1");
- }
- }
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != output->data_type());
- }
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
-
- if(output_multipliers != nullptr && output_shifts != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
- if(output_stage.is_quantized_per_channel)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != output_shifts->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != output_multipliers->dimension(0));
- }
- }
- }
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMKernelInfo &gemm_info,
- ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, ITensorInfo *bias,
- ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed)
-{
- const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
-
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
- bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0);
-
- Window win{};
- Window win_out{};
- bool window_changed = false;
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_output_as_3d to be false: a batched GEMM is dispatched instead.
- if(reinterpret_input_as_3d == reinterpret_output_as_3d)
- {
- reinterpret_output_as_3d = false;
- }
-
- // Output tensor auto initialization if not yet initialized
- const TensorShape expected_output_shape = compute_mm_shape(*input0, *input1, gemm_info);
- if(output_stage.type != GEMMLowpOutputStageType::NONE)
- {
- auto_init_if_empty(*output, input0->clone()->set_tensor_shape(expected_output_shape).set_data_type(output_stage.output_data_type));
- }
- else
- {
- auto_init_if_empty(*output, input0->clone()->set_tensor_shape(expected_output_shape).set_data_type(DataType::S32));
- }
-
- TensorInfo tmp_info(*output);
-
- if(reinterpret_output_as_3d)
- {
- // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(output->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- // Configure kernel window
- num_elems_processed_per_iteration_x = gemm_info.rhs_info.n0;
- num_elems_processed_per_iteration_y = gemm_info.lhs_info.m0;
-
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- if(gemm_info.a_offset != 0)
- {
- AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x);
- window_changed = window_changed || update_window_and_padding(win_out, vector_sum_col_access);
- }
- // No access window needed for vector_sum_row
- ARM_COMPUTE_UNUSED(vector_sum_row);
-
- if(bias != nullptr)
- {
- AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x);
- window_changed = window_changed || update_window_and_padding(win_out, bias_access);
- }
-
- if(output_multipliers != nullptr && output_multipliers->dimension(0) > 1)
- {
- AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x);
- AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x);
- window_changed = window_changed || update_window_and_padding(win_out, output_multipliers_access, output_shifts_access);
- }
- }
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel()
- : _input0(nullptr),
- _input1(nullptr),
- _output(nullptr),
- _vector_sum_col(nullptr),
- _vector_sum_row(nullptr),
- _bias(nullptr),
- _output_multipliers(nullptr),
- _output_shifts(nullptr),
- _slide_matrix_b(true),
- _reinterpret_input_as_3d(false),
- _reinterpret_output_as_3d(false),
- _use_dummy_work_items(false),
- _is_quantized_per_channel(false),
- _fuse_output_stage(false)
-{
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info,
- const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts);
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output,
- const GEMMKernelInfo &gemm_info,
- const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(),
- input1->info(),
- output->info(),
- gemm_info,
- vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
- vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
- bias != nullptr ? bias->info() : nullptr,
- output_multipliers != nullptr ? output_multipliers->info() : nullptr,
- output_shifts != nullptr ? output_shifts->info() : nullptr));
-
- auto padding_info = get_padding_info({ input0, input1, output, vector_sum_row });
- const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info;
- const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info;
- const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
- const int32_t a_offset = gemm_info.a_offset;
- const int32_t b_offset = gemm_info.b_offset;
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _vector_sum_col = vector_sum_col;
- _vector_sum_row = vector_sum_row;
- _bias = bias;
- _output_multipliers = output_multipliers;
- _output_shifts = output_shifts;
- _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
- _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0);
- _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
- _is_quantized_per_channel = output_stage.is_quantized_per_channel;
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
- {
- _reinterpret_input_as_3d = false;
- _reinterpret_output_as_3d = false;
- }
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
- _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
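- // For example, when a convolution is lowered to a GEMM, the LHS (im2col output) can be 4D while
- // the RHS weights stay 2D; input1 then has fewer dimensions than input0, _slide_matrix_b is
- // false and the same RHS slice is reused for every batch (see the matching logic in run()).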
-
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(),
- input1->info(),
- output->info(),
- gemm_info,
- vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
- vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
- bias != nullptr ? bias->info() : nullptr,
- output_multipliers != nullptr ? output_multipliers->info() : nullptr,
- output_shifts != nullptr ? output_shifts->info() : nullptr,
- num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // If _reinterpret_input_as_3d and _reinterpret_output_as_3d were both true, they have been reset
- // to false above and a batched GEMM is dispatched instead, to reduce the complexity of the address
- // calculation within the OpenCL kernel. In that case the actual m used by the kernel is given by
- // output->info()->dimension(1) and not by gemm_info.m
- const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : output->info()->dimension(1);
-
- // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
- // NOTE: This might have implications on heuristics and performance
- const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
-
- // Calculate the partial M0 and N0 used by the store (rather than the load) for the leftover
- // blocks at the end of a row/column, if any. This avoids the need for output padding.
- const unsigned int partial_store_m0 = internal_m % internal_m0;
- const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
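- // Worked example: with internal_m = 33 and lhs_info.m0 = 4, internal_m0 = min(33, 4) = 4 and
- // partial_store_m0 = 33 % 4 = 1, so the last block along M stores a single row; likewise
- // n = 30 with n0 = 8 gives partial_store_n0 = 6. The leftover blocks are handled with partial
- // stores rather than by padding the output.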
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
- build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
- build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
- build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
- build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
- build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
- build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
- build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
- build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type()));
-
- std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_");
- kernel_name += rhs_info.transpose ? "t" : "nt";
-
- if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- kernel_name += "_fused_output_stage_fixedpoint";
- _fuse_output_stage = true;
- // If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0 && vector_sum_col != nullptr)
- {
- build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
- build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
- }
- // If b_offset == 0, vector_sum_row can be a nullptr
- build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
- build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * input0->info()->dimension(0)));
- build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
- build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
- build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
- build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0]));
- build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
-
- const int min = output_stage.gemmlowp_min_bound;
- const int max = output_stage.gemmlowp_max_bound;
-
- PixelValue min_val{};
- PixelValue max_val{};
- std::tie(min_val, max_val) = get_min_max(output->info()->data_type());
- build_opts.add_option_if(min != min_val.get<int32_t>(), "-DMIN_BOUND=" + support::cpp11::to_string(min));
- build_opts.add_option_if(max != max_val.get<int32_t>(), "-DMAX_BOUND=" + support::cpp11::to_string(max));
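- // For instance, with QASYMM8 output get_min_max() yields [0, 255]; MIN_BOUND/MAX_BOUND are
- // only defined when the requested bounds differ from that range, so an unbounded output
- // stage compiles without the extra clamping code.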
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
- _config_id += "_";
- _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(gemm_info.k);
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.m0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.n0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.k0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.h0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.interleave);
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info,
- const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
- input1->clone().get(),
- output->clone().get(),
- gemm_info,
- vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
- vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
- bias != nullptr ? bias->clone().get() : nullptr,
- output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr,
- output_shifts != nullptr ? output_shifts->clone().get() : nullptr,
- num_elements_processed)
- .first);
-
- return Status{};
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- if(_input1->info()->num_dimensions() < 3)
- {
- // If matrix B has fewer than 3 dimensions, the stride of its batch dimension (index 3) must be
- // zero, so that the same matrix B can safely be reused for every slice
- ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
- }
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- if(_reinterpret_input_as_3d)
- {
- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
- const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- if(_reinterpret_output_as_3d)
- {
- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
- const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- // Set window for vector_sum_col
- Window win_vector_sum_col = slice;
- win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- // Set window for vector_sum_row
- Window win_vector_sum_row = slice;
- win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Window biases_slice = slice;
- biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
- biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
- do
- {
- Window slice_b = slice;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input0, slice);
- add_2D_tensor_argument(idx, _input1, slice_b);
- add_2D_tensor_argument(idx, _output, slice);
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
- if(_reinterpret_input_as_3d)
- {
- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
- idx++;
- }
-
- if(_reinterpret_output_as_3d)
- {
- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- idx++;
- }
-
- if(_fuse_output_stage)
- {
- add_2D_tensor_argument_if((_vector_sum_col != nullptr), idx, _vector_sum_col, win_vector_sum_col);
- add_2D_tensor_argument_if((_vector_sum_row != nullptr), idx, _vector_sum_row, win_vector_sum_row);
- add_1D_tensor_argument_if((_bias != nullptr), idx, _bias, biases_slice);
- add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_multipliers, biases_slice);
- add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_shifts, biases_slice);
- }
- enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h
deleted file mode 100644
index 222a8615e4..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
- /** OpenCL kernel to multiply matrices with QASYMM8/QASYMM8_SIGNED data types when only the RHS input matrix (input1) has been reshaped
- *
- * @note The input matrix input1 must be reshaped through @ref CLGEMMReshapeRHSMatrixKernel
- * @note For fused output stage, only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT type is supported
- */
-class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel : public ICLKernel
-{
-public:
- /** Default Constructor */
- CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel(const CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &operator=(const CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &operator=(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
- * Only the following values are supported for LHS info:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * Only the following values are supported for RHS info:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * rhs_info.transpose: true
- * @param[in] vector_sum_col (Optional) Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
- * @param[in] vector_sum_row (Optional) Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
- * @param[in]  bias               (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- *                                Biases are a 1D tensor with dimensions [OFM]. Data type supported: S32.
- * @param[in]  output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- *                                Supported data types: S32.
- * @param[in]  output_shifts      (Optional) Output shifts tensor. In case of per-channel quantization, the number of shifts must be equal to the number of filters (OFM).
- *                                Supported data types: S32.
- */
- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info, const ICLTensor *vector_sum_col = nullptr,
- const ICLTensor *vector_sum_row = nullptr, const ICLTensor *bias = nullptr, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in]  input1             Input tensor containing the RHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
- * Only the following values are supported for LHS info:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * Only the following values are supported for RHS info:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * rhs_info.transpose: true
- * @param[in] vector_sum_col (Optional) Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
- * @param[in] vector_sum_row (Optional) Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
- * @param[in]  bias               (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- *                                Biases are a 1D tensor with dimensions [OFM]. Data type supported: S32.
- * @param[in]  output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- *                                Supported data types: S32.
- * @param[in]  output_shifts      (Optional) Output shifts tensor. In case of per-channel quantization, the number of shifts must be equal to the number of filters (OFM).
- *                                Supported data types: S32.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info, const ICLTensor *vector_sum_col = nullptr,
- const ICLTensor *vector_sum_row = nullptr, const ICLTensor *bias = nullptr, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
- *
- * @param[in] input0 Input tensor info for the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] input1 Input tensor info for the RHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[in] output Output tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
- * Only the following values are supported for LHS info:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * Only the following values are supported for RHS info:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * rhs_info.transpose: true
- * @param[in] vector_sum_col (Optional) Input row-vector info of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
- * @param[in] vector_sum_row (Optional) Input row-vector info of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
- * @param[in]  bias               (Optional) Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- *                                Biases are a 1D tensor with dimensions [OFM]. Data type supported: S32.
- * @param[in]  output_multipliers (Optional) Output multipliers tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- *                                Supported data types: S32.
- * @param[in]  output_shifts      (Optional) Output shifts tensor info. In case of per-channel quantization, the number of shifts must be equal to the number of filters (OFM).
- *                                Supported data types: S32.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info, const ITensorInfo *vector_sum_col = nullptr,
- const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr, const ITensorInfo *output_multipliers = nullptr,
- const ITensorInfo *output_shifts = nullptr);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input0;
- const ICLTensor *_input1;
- ICLTensor *_output;
- const ICLTensor *_vector_sum_col;
- const ICLTensor *_vector_sum_row;
- const ICLTensor *_bias;
- const ICLTensor *_output_multipliers;
- const ICLTensor *_output_shifts;
- bool _slide_matrix_b;
- bool _reinterpret_input_as_3d;
- bool _reinterpret_output_as_3d;
- bool _use_dummy_work_items;
- bool _is_quantized_per_channel;
- bool _fuse_output_stage;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H */
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
deleted file mode 100644
index c7844b9c28..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
- int32_t a_offset, int32_t b_offset)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
-
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
- }
-
- // If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
- }
-
- // If b_offset == 0, vector_sum_row can be a nullptr
- if(b_offset != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
-
- // Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
-
- // Validate input
- ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
- ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
-
- TensorShape output_shape = mm_result->tensor_shape();
- if(output_shape.num_dimensions() > 1)
- {
- const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
-
- TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
- vector_sum_row_shape.collapse_from(1);
- output_shape.collapse_from(output_batch_idx);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
- "vector_sum_row must have the same number of batches as the mm_result tensor");
-
- if(a_offset != 0)
- {
- TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
- vector_sum_col_shape.collapse_from(1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
- "vector_sum_col must have the same number of batches as vector_sum_row, or its number of batches must be 1");
- }
- }
- }
-
- return Status{};
-}
-} // namespace
-
-CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel()
- : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _bias(nullptr)
-{
-}
-
-void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset,
- int32_t b_offset)
-{
- configure(CLKernelLibrary::get().get_compile_context(), mm_result, vector_sum_col, vector_sum_row, bias, k, a_offset, b_offset);
-}
-
-void CLGEMMLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context, ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row,
- const ICLTensor *bias,
- int32_t k, int32_t a_offset,
- int32_t b_offset)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
- vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
- vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
- bias != nullptr ? bias->info() : nullptr,
- a_offset, b_offset)); // NOLINT
-
- auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias });
-
- _vector_sum_col = vector_sum_col;
- _vector_sum_row = vector_sum_row;
- _mm_result = mm_result;
- _bias = bias;
-
- // Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = vector_sum_row != nullptr
- && mm_result->info()->num_dimensions() > 1
- && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
-
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->info()->dimension(0));
-
- // Set the arguments to pass at compile time
- CLBuildOptions build_opts;
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->info()->dimension(0) % num_elems_processed_per_iteration));
-
- // If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
- {
- build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
- build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
- }
- // If b_offset == 0, vector_sum_row can be a nullptr
- build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
- build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
- build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1)));
- build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2)));
- build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-
- std::string kernel_name("gemmlowp_offset_contribution");
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration));
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name + "_";
- _config_id += support::cpp11::to_string(mm_result->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(mm_result->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(mm_result->info()->dimension(2));
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
- int32_t a_offset, int32_t b_offset)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset));
- return Status{};
-}
-
-void CLGEMMLowpOffsetContributionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- // Set window for vector_sum_col
- Window win_vector_sum_col = slice;
- win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- // Set window for vector_sum_row
- Window win_vector_sum_row = slice;
- win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Window biases_slice = slice;
- biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
- biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _mm_result, slice);
- add_2D_tensor_argument_if((_vector_sum_col != nullptr), idx, _vector_sum_col, win_vector_sum_col);
- add_2D_tensor_argument_if((_vector_sum_row != nullptr), idx, _vector_sum_row, win_vector_sum_row);
- add_1D_tensor_argument_if((_bias != nullptr), idx, _bias, biases_slice);
-
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
deleted file mode 100644
index f8705595a0..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to add the offset contribution after the matrix multiplication. The computation is performed in-place
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication),
- * and adds to it the offset contribution of matrix A and matrix B in-place.
- *
- * The final result is:
- *
- * mm_result[i][k] = mm_result[i][k] +
- * (vector_sum_col[k] * a_offset) +
- * (vector_sum_row[i] * b_offset) +
- * (a_offset * b_offset * k)
- *
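- * This follows from expanding the product of the offset-corrected operands (a sketch, assuming
- * the usual GEMMLowp convention of adding the offsets to the quantized values; K below denotes
- * the accumulation depth passed as the k argument):
- *
- *   sum_j (A[i][j] + a_offset) * (B[j][k] + b_offset)
- *     = sum_j A[i][j] * B[j][k]     -> the raw mm_result
- *     + a_offset * sum_j B[j][k]    -> a_offset * vector_sum_col[k]
- *     + b_offset * sum_j A[i][j]    -> b_offset * vector_sum_row[i]
- *     + a_offset * b_offset * K     -> the constant K_OFFSET term
- *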
- */
-class CLGEMMLowpOffsetContributionKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLGEMMLowpOffsetContributionKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpOffsetContributionKernel(const CLGEMMLowpOffsetContributionKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpOffsetContributionKernel &operator=(const CLGEMMLowpOffsetContributionKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpOffsetContributionKernel(CLGEMMLowpOffsetContributionKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpOffsetContributionKernel &operator=(CLGEMMLowpOffsetContributionKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in]      bias           Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- *                                Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
- * @param[in]      k              Number of matrix A columns (equal to the number of matrix B rows)
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- */
- void configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset, int32_t b_offset);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in]      bias            Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- *                                 Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
- * @param[in]      k               Number of matrix A columns (equal to the number of matrix B rows)
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset,
- int32_t b_offset);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionKernel
- *
- * @param[in] mm_result      Input tensor containing the result of the matrix multiplication. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] bias           Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- *                           Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_vector_sum_col;
- const ICLTensor *_vector_sum_row;
- ICLTensor *_mm_result;
- const ICLTensor *_bias;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H */
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
deleted file mode 100644
index b41d8704bd..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output,
- int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
-
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
- if(output_stage.is_quantized_per_channel)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_shifts->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_multipliers->dimension(0));
- }
-
- // If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
- }
-
- // If b_offset == 0, vector_sum_row can be a nullptr
- if(b_offset != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
-
- // Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
-
- // Validate input
- ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
- ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
-
- TensorShape output_shape = mm_result->tensor_shape();
- if(output_shape.num_dimensions() > 1)
- {
- const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
-
- TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
- vector_sum_row_shape.collapse_from(1);
- output_shape.collapse_from(output_batch_idx);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
- "vector_sum_row must have the same number of batches as the mm_result tensor");
-
- if(a_offset != 0)
- {
- TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
- vector_sum_col_shape.collapse_from(1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
- "vector_sum_col must have the same number of batches as vector_sum_row, or its number of batches must be 1");
- }
- }
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE);
- // Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
- {
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != output->data_type());
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output);
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(), "per-channel quantization info is incorrect: multipliers and shifts must have the same size");
-
- return Status{};
-}
-} // namespace
-
-CLGEMMLowpOffsetContributionOutputStageKernel::CLGEMMLowpOffsetContributionOutputStageKernel()
- : _mm_result(nullptr),
- _vector_sum_col(nullptr),
- _vector_sum_row(nullptr),
- _bias(nullptr),
- _output(nullptr),
- _output_multipliers(nullptr),
- _output_shifts(nullptr),
- _is_quantized_per_channel(false)
-{
-}
-
-void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output,
- int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
- configure(CLKernelLibrary::get().get_compile_context(), mm_result, vector_sum_col, vector_sum_row, bias, output, k, a_offset, b_offset, output_stage, output_multipliers, output_shifts);
-}
-
-void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext &compile_context, const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row,
- const ICLTensor *bias, ICLTensor *output,
- int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output, output_multipliers, output_shifts);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
- vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
- vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
- bias != nullptr ? bias->info() : nullptr,
- output->info(),
- a_offset, b_offset, output_stage,
- output_multipliers->info(), output_shifts->info())); // NOLINT
-
- auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias, output, output_multipliers, output_shifts });
-
- const int min = output_stage.gemmlowp_min_bound;
- const int max = output_stage.gemmlowp_max_bound;
-
- _vector_sum_col = vector_sum_col;
- _vector_sum_row = vector_sum_row;
- _mm_result = mm_result;
- _bias = bias;
- _output = output;
- _output_multipliers = output_multipliers;
- _output_shifts = output_shifts;
- _is_quantized_per_channel = output_stage.is_quantized_per_channel;
-
- // Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = vector_sum_row != nullptr
- && mm_result->info()->num_dimensions() > 1
- && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
-
- // Auto initialize the output
- auto_init_if_empty(*output->info(), mm_result->info()->clone()->set_data_type(output_stage.output_data_type));
-
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->info()->dimension(0));
-
- // Set the arguments to pass at compile time
- CLBuildOptions build_opts;
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->info()->dimension(0) % num_elems_processed_per_iteration));
-
- // If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
- {
- build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
- build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
- }
- // If b_offset == 0, vector_sum_row can be a nullptr
- build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
- build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
- build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1)));
- build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2)));
- build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
- build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
- build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
- build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0]));
- build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
- build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
-
- PixelValue min_val{};
- PixelValue max_val{};
- std::tie(min_val, max_val) = get_min_max(output->info()->data_type());
- build_opts.add_option_if((min > min_val.get<int32_t>()), "-DMIN_BOUND=" + support::cpp11::to_string(min));
- build_opts.add_option_if((max < max_val.get<int32_t>()), "-DMAX_BOUND=" + support::cpp11::to_string(max));
-
- std::string kernel_name("gemmlowp_offset_contribution");
- kernel_name += "_" + string_from_gemmlowp_output_stage(output_stage.type);
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration));
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name + "_";
- _config_id += support::cpp11::to_string(mm_result->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(mm_result->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(mm_result->info()->dimension(2));
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
- const ITensorInfo *output, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage, output_multipliers, output_shifts));
- return Status{};
-}
-
-void CLGEMMLowpOffsetContributionOutputStageKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- // Set window for vector_sum_col
- Window win_vector_sum_col = slice;
- win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- // Set window for vector_sum_row
- Window win_vector_sum_row = slice;
- win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Window biases_slice = slice;
- biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
- biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _mm_result, slice);
- add_2D_tensor_argument_if((_vector_sum_col != nullptr), idx, _vector_sum_col, win_vector_sum_col);
- add_2D_tensor_argument_if((_vector_sum_row != nullptr), idx, _vector_sum_row, win_vector_sum_row);
- add_1D_tensor_argument_if((_bias != nullptr), idx, _bias, biases_slice);
- add_3D_tensor_argument(idx, _output, slice);
- add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_multipliers, biases_slice);
- add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_shifts, biases_slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
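For reference, the A_OFFSET/B_OFFSET/K_OFFSET build options above encode the standard GEMMLowp offset-contribution identity. A scalar sketch of the per-element arithmetic (an illustrative helper of ours, not the OpenCL kernel itself):

#include <cstdint>

// Illustrative scalar model of the offset contribution applied to one int32
// accumulator. sum_col is the column-sum of matrix B for this output column,
// sum_row is the row-sum of matrix A for this output row; a term drops out
// when its offset is zero, matching the conditional -DA_OFFSET/-DB_OFFSET.
int32_t offset_contribution(int32_t mm_result, int32_t a_offset, int32_t b_offset,
                            int32_t k, int32_t sum_col, int32_t sum_row)
{
    int32_t acc = mm_result;
    acc += a_offset * sum_col;      // only when a_offset != 0
    acc += b_offset * sum_row;      // only when b_offset != 0
    acc += a_offset * b_offset * k; // the precomputed K_OFFSET
    return acc;                     // the selected output stage then requantizes acc
}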
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
deleted file mode 100644
index 15f54d17a5..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to add the offset contribution after the matrix multiplication and perform the output stage.
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), adds to it the offset contribution
- * of matrix A and matrix B and performs the output stage defined by the output_stage argument
- *
- * @note For quantized computations the output data type for auto-initialization must be passed as part of the @ref GEMMLowpOutputStageInfo.
- */
-class CLGEMMLowpOffsetContributionOutputStageKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLGEMMLowpOffsetContributionOutputStageKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpOffsetContributionOutputStageKernel(const CLGEMMLowpOffsetContributionOutputStageKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpOffsetContributionOutputStageKernel &operator=(const CLGEMMLowpOffsetContributionOutputStageKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpOffsetContributionOutputStageKernel(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpOffsetContributionOutputStageKernel &operator=(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED.
- * @param[in] k Number of matrix A columns or matrix B rows
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- * @param[in] output_stage GEMMLowp output stage info
- * @param[in] output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32
- * @param[in] output_shifts Output shifts tensor. In case of per-channel quantization, the number of shifts must be equal to the number of filters (OFM).
- * Supported data types: S32
- */
- void configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output, int32_t k, int32_t a_offset, int32_t b_offset,
- const GEMMLowpOutputStageInfo &output_stage, const ICLTensor *output_multipliers, const ICLTensor *output_shifts);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED.
- * @param[in] k Number of matrix A columns or matrix B rows
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- * @param[in] output_stage GEMMLowp output stage info
- * @param[in] output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32
- * @param[in] output_shifts Output shifts tensor. In case of per-channel quantization, the number of shifts must be equal to the number of filters (OFM).
- * Supported data types: S32
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output,
- int32_t k,
- int32_t a_offset, int32_t b_offset,
- const GEMMLowpOutputStageInfo &output_stage, const ICLTensor *output_multipliers, const ICLTensor *output_shifts);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionOutputStageKernel
- *
- * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED.
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- * @param[in] output_stage GEMMLowp output stage info
- * @param[in] output_multipliers Output multipliers tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32
- * @param[in] output_shifts Output shifts tensor info. In case of per-channel quantization, the number of shifts must be equal to the number of filters (OFM).
- * Supported data types: S32
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, int32_t a_offset,
- int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_mm_result;
- const ICLTensor *_vector_sum_col;
- const ICLTensor *_vector_sum_row;
- const ICLTensor *_bias;
- ICLTensor *_output;
- const ICLTensor *_output_multipliers;
- const ICLTensor *_output_shifts;
- bool _is_quantized_per_channel;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H */
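A minimal usage sketch for the class above, assuming already-allocated CLTensor objects whose names (mm_result, vector_sum_col, vector_sum_row, bias, output, output_multipliers, output_shifts) and scalar parameters are ours:

// Validate the tensor infos first, then configure with the same arguments.
CLGEMMLowpOffsetContributionOutputStageKernel kernel;
ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpOffsetContributionOutputStageKernel::validate(
    mm_result.info(), vector_sum_col.info(), vector_sum_row.info(), bias.info(),
    output.info(), a_offset, b_offset, output_stage,
    output_multipliers.info(), output_shifts.info()));
kernel.configure(&mm_result, &vector_sum_col, &vector_sum_row, &bias, &output,
                 k, a_offset, b_offset, output_stage,
                 &output_multipliers, &output_shifts);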
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
deleted file mode 100644
index 6a58d5e202..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
-
- // Check bias if it exists
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != info->output_data_type, "Mismatching output data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
-
- return Status{};
-}
-} // namespace
-
-CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel()
- : _input(nullptr), _bias(nullptr), _output(nullptr)
-{
-}
-
-Status CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
- const GEMMLowpOutputStageInfo *info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, info));
-
- return Status{};
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
- const GEMMLowpOutputStageInfo *info)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), info));
-
- auto padding_info = get_padding_info({ input, bias, output });
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(info->output_data_type));
-
- _input = input;
- _bias = bias;
- _output = output;
-
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0));
-
- // Set the arguments to pass at compile time
- auto min = info->gemmlowp_min_bound;
- auto max = info->gemmlowp_max_bound;
- CLBuildOptions build_opts;
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
- build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(info->gemmlowp_offset));
- build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(info->gemmlowp_multiplier));
- build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(info->gemmlowp_shift));
- build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
- build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max),
- "-DMIN_BOUND=" + support::cpp11::to_string(min));
- build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max),
- "-DMAX_BOUND=" + support::cpp11::to_string(max));
- build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-
- // Create kernel
- const std::string kernel_name = (info->output_data_type == DataType::QSYMM16) ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16" : "gemmlowp_output_stage_quantize_down_fixedpoint";
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure kernel window
- auto win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
- ICLKernel::configure_internal(win);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- // Create input window
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- // Setup bias slice
- unsigned int idx1 = num_arguments_per_3D_tensor();
- if(_bias != nullptr)
- {
- Window biases_slice(slice);
- biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
- biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
- add_1D_tensor_argument(idx1, _bias, biases_slice);
- }
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx1, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
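The RESULT_FIXEDPOINT_MULTIPLIER and RESULT_SHIFT options set above drive gemmlowp-style fixed point requantization. A scalar sketch of that arithmetic, mirroring gemmlowp's reference SaturatingRoundingDoublingHighMul and RoundingDivideByPOT (our own helpers, not the OpenCL code):

#include <cstdint>
#include <limits>

// round((a * b) / 2^31) with saturation; INT32_MIN * INT32_MIN is the only
// case that would overflow, so it saturates to INT32_MAX.
int32_t sat_rounding_doubling_high_mul(int32_t a, int32_t b)
{
    if(a == std::numeric_limits<int32_t>::min() && b == std::numeric_limits<int32_t>::min())
    {
        return std::numeric_limits<int32_t>::max();
    }
    const int64_t ab    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int64_t nudge = (ab >= 0) ? (1ll << 30) : (1 - (1ll << 30));
    return static_cast<int32_t>((ab + nudge) / (1ll << 31));
}

// Round-to-nearest division by 2^exponent (ties rounded away from zero).
int32_t rounding_divide_by_pot(int32_t x, int exponent)
{
    const int32_t mask      = (1 << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
    return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}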
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h
deleted file mode 100644
index 8653102cd8..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED/QSYMM16
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final quantized value.
- * The following computations will be performed by the kernel:
- *
- * -# Compute the fixed point multiplication of each entry of the input by gemmlowp_multiplier
- * -# Add bias to final result if bias tensor is not a nullptr
- * -# Round to nearest division by a power-of-two using result_shift
- * -# Add offset to each result
- * -# Clamp the value between the specified min and max bounds
- * -# Clamp the resulting int32 values to the proper quantized range and cast to QASYMM8/QASYMM8_SIGNED/QSYMM16.
- */
-class CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel(const CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel(CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16.
- * @param[in] info Output stage info. Used to pass the quantized output data type
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16.
- * @param[in] info Output stage info. Used to pass the quantized output data type
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- const ICLTensor *_bias;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H */
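Composing the two helpers sketched after the .cpp above gives an end-to-end scalar model of the steps listed in the class comment; parameter names mirror the GEMMLowpOutputStageInfo fields and the numbers in the example are made up:

#include <algorithm>
#include <cstdint>

int32_t quantize_down_fixedpoint(int32_t acc, int32_t bias, int32_t multiplier,
                                 int32_t shift, int32_t offset,
                                 int32_t qmin, int32_t qmax)
{
    int32_t v = sat_rounding_doubling_high_mul(acc + bias, multiplier);
    v = rounding_divide_by_pot(v, shift);     // RESULT_SHIFT
    v += offset;                              // RESULT_OFFSET_AFTER_SHIFT
    return std::min(qmax, std::max(qmin, v)); // MIN_BOUND / MAX_BOUND clamp
}

// Example: multiplier = 1 << 30 (i.e. 0.5 in Q0.31), shift = 9, offset = 128,
// bounds [0, 255]. For acc + bias = 1000: 0.5 * 1000 = 500; 500 / 512 rounds
// to 1; 1 + 128 = 129, already inside the bounds, so the output is 129.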
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
deleted file mode 100644
index a5888a5ded..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
- const GEMMLowpOutputStageInfo *info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) && (info->output_data_type != DataType::QASYMM8_SIGNED));
- ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)));
- ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))
- || info->gemmlowp_min_bound > info->gemmlowp_max_bound);
-
- // Check bias if it exists
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != info->output_data_type, "Mismatching output data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
-
- return Status{};
-}
-} // namespace
-
-class Coordinates;
-CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel()
- : _input(nullptr), _bias(nullptr), _output(nullptr)
-{
-}
-
-Status CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
- const GEMMLowpOutputStageInfo *info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, info));
-
- return Status{};
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
- const GEMMLowpOutputStageInfo *info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info);
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
- const GEMMLowpOutputStageInfo *info)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), info));
-
- auto padding_info = get_padding_info({ input, bias, output });
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(info->output_data_type));
-
- _input = input;
- _bias = bias;
- _output = output;
-
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0));
-
- auto min = info->gemmlowp_min_bound;
- auto max = info->gemmlowp_max_bound;
-
- // Set the arguments to pass at compile time
- CLBuildOptions build_opts;
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
- build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(info->gemmlowp_real_multiplier));
- build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(info->gemmlowp_offset));
- build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
- // Use the limits of the actual output data type rather than hardcoded uint8 bounds
- build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))), "-DMIN_BOUND=" + support::cpp11::to_string(min));
- build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))), "-DMAX_BOUND=" + support::cpp11::to_string(max));
- build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-
- // Create kernel
- _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down_float", build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- ICLKernel::configure_internal(win);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- // Create input window
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- // Setup bias slice
- unsigned int idx1 = num_arguments_per_3D_tensor();
- if(_bias != nullptr)
- {
- Window biases_slice(slice);
- biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
- biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
- add_1D_tensor_argument(idx1, _bias, biases_slice);
- }
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx1, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
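The REAL_MULTIPLIER and OUTPUT_OFFSET options above replace the fixed point multiply with a single floating point scale. A scalar sketch of that path (an illustrative helper of ours; the OpenCL code performs the rounding during the saturating conversion, which lround only approximates):

#include <algorithm>
#include <cmath>
#include <cstdint>

int32_t quantize_down_float(int32_t acc, int32_t bias, float real_multiplier,
                            int32_t offset, int32_t qmin, int32_t qmax)
{
    const float scaled = static_cast<float>(acc + bias) * real_multiplier;
    const int32_t q    = static_cast<int32_t>(std::lround(scaled)) + offset;
    return std::min(qmax, std::max(qmin, q));
}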
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
deleted file mode 100644
index 0a8d5e1942..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
- * The following computations will be performed by the kernel:
- *
- * -# Add bias to the accumulator if the bias tensor is not a nullptr
- * -# Convert the accumulator to floating point and multiply it by the real multiplier gemmlowp_real_multiplier
- * -# Add offset to each result
- * -# Round to the nearest integer
- * -# Clamp the value between the specified min and max bounds
- * -# Clamp the resulting int32 values:
- *    - to the [0..255] range and cast to QASYMM8.
- *    - to the [-128..127] range and cast to QASYMM8_SIGNED.
- */
-class CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(const CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] info Output stage info. Used to pass the quantized output data type
- */
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] info Output stage info. Used to pass the quantized output data type
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] info Output stage info. Used to pass the quantized output data type
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- const ICLTensor *_bias;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H */
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
deleted file mode 100644
index 7d4352479c..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) && (output_stage->output_data_type != DataType::QASYMM8_SIGNED));
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)));
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))
- || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound);
-
- // Check bias if it exists
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != output_stage->output_data_type, "Mismatching output data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
-
- return Status{};
-}
-} // namespace
-
-CLGEMMLowpQuantizeDownInt32ScaleKernel::CLGEMMLowpQuantizeDownInt32ScaleKernel()
- : _input(nullptr), _bias(nullptr), _output(nullptr)
-{
-}
-
-Status CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, output_stage));
-
- return Status{};
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, output_stage);
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
- const GEMMLowpOutputStageInfo *output_stage)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
- (bias != nullptr) ? bias->info() : nullptr,
- output->info(),
- output_stage));
-
- auto padding_info = get_padding_info({ input, bias, output });
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_stage->output_data_type));
-
- _input = input;
- _bias = bias;
- _output = output;
-
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0));
-
- // Set the arguments to pass at compile time
- auto min = output_stage->gemmlowp_min_bound;
- auto max = output_stage->gemmlowp_max_bound;
- CLBuildOptions build_opts;
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
- build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage->gemmlowp_offset));
- build_opts.add_option("-DRESULT_MULT_INT=" + support::cpp11::to_string(output_stage->gemmlowp_multiplier));
- build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage->gemmlowp_shift));
- build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max),
- "-DMIN_BOUND=" + support::cpp11::to_string(min));
- build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max),
- "-DMAX_BOUND=" + support::cpp11::to_string(max));
- build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
- build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-
- // Create kernel
- _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down", build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- ICLKernel::configure_internal(win);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- unsigned int idx1 = num_arguments_per_3D_tensor();
- if(_bias != nullptr)
- {
- Window biases_slice(slice);
- biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
- biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
- add_1D_tensor_argument(idx1, _bias, biases_slice);
- }
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx1, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
\ No newline at end of file
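A scalar sketch of the RESULT_OFFSET / RESULT_MULT_INT / RESULT_SHIFT arithmetic assembled above (an illustrative helper of ours; the exact sequencing lives in the gemmlowp_output_stage_quantize_down OpenCL source):

#include <algorithm>
#include <cstdint>

int32_t quantize_down_scale(int32_t acc, int32_t bias, int32_t result_offset,
                            int32_t result_mult_int, int32_t result_shift,
                            int32_t qmin, int32_t qmax)
{
    int32_t v = (acc + bias + result_offset) * result_mult_int;
    v >>= result_shift;                       // arithmetic shift right
    return std::min(qmax, std::max(qmin, v)); // MIN_BOUND / MAX_BOUND clamp
}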
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
deleted file mode 100644
index abdf33ea43..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
- * The following computations will be performed by the kernel:
- *
- * -# Add offset terms to final result
- * -# Multiply each entry of result by result_mult_int
- * -# Add bias to final result if bias tensor is not a nullptr
- * -# Shift the int32 accumulator by result_shift
- * -# Clamp the value between the specified min and max bounds
- * -# Clamp the resulting int32 values:
- *    - to the [0..255] range and cast to QASYMM8.
- *    - to the [-128..127] range and cast to QASYMM8_SIGNED.
- *
- */
-class CLGEMMLowpQuantizeDownInt32ScaleKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLGEMMLowpQuantizeDownInt32ScaleKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpQuantizeDownInt32ScaleKernel(const CLGEMMLowpQuantizeDownInt32ScaleKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpQuantizeDownInt32ScaleKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpQuantizeDownInt32ScaleKernel(CLGEMMLowpQuantizeDownInt32ScaleKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpQuantizeDownInt32ScaleKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] output_stage GEMMLowp output stage metadata.
- */
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] output_stage GEMMLowp output stage metadata.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleKernel
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- const ICLTensor *_bias;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H */
\ No newline at end of file
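A minimal setup sketch for the class above, assuming already-allocated CLTensor objects src and dst (names and values are ours; the field names come from GEMMLowpOutputStageInfo):

GEMMLowpOutputStageInfo output_stage{};
output_stage.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN;
output_stage.gemmlowp_offset     = 10;  // result offset
output_stage.gemmlowp_multiplier = 2;   // result_mult_int
output_stage.gemmlowp_shift      = 3;   // result_shift
output_stage.gemmlowp_min_bound  = 0;
output_stage.gemmlowp_max_bound  = 255;
output_stage.output_data_type    = DataType::QASYMM8;

CLGEMMLowpQuantizeDownInt32ScaleKernel kernel;
ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(
    src.info(), nullptr /* no bias */, dst.info(), &output_stage));
kernel.configure(&src, nullptr, &dst, &output_stage);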
diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
deleted file mode 100644
index d508bf6f21..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8);
-
- if(output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(1), "Output vector must have length equal to the number of rows of the input matrix");
- }
- return Status{};
-}
-
-Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
-
- if(output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0), "Output vector must have length equal to the number of columns of the input matrix");
- }
- return Status{};
-}
-} // namespace
-
-ICLGEMMLowpReductionKernel::ICLGEMMLowpReductionKernel()
- : _input(), _output()
-{
-}
-
-void CLGEMMLowpMatrixAReductionKernel::configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), mtx_a, vector_sum_row, info);
-}
-
-void CLGEMMLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context, const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info()));
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*vector_sum_row->info(), TensorShape(mtx_a->info()->dimension(1)), 1, DataType::S32);
-
- auto padding_info = get_padding_info({ mtx_a, vector_sum_row });
-
- _input = mtx_a;
- _output = vector_sum_row;
-
- // Set the arguments to pass at compile time
- CLBuildOptions build_opts;
- build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(mtx_a->info()->dimension(0)));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_a->info()->data_type()));
- build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_a->info()->data_type()));
- build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar));
-
- const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
-
- std::string kernel_name = "gemmlowp_matrix_a_reduction" + std::string(is_dot8_supported ? "_dot8" : "");
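- // The "_dot8" variant targets devices whose dot8_supported() check passes, i.e. those exposing the Arm 8-bit dot-product OpenCL extension.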
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure kernel window
- // This kernel does not need padding
- Window win = calculate_max_window(*vector_sum_row->info(), Steps());
- ICLKernel::configure_internal(win);
-
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += support::cpp11::to_string(_input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(_input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(_input->info()->dimension(2));
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
-
- return Status{};
-}
-
-void CLGEMMLowpMatrixAReductionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY);
- Window slice_in = collapsed.first_slice_window_2D();
- Window slice_out = collapsed.first_slice_window_2D();
-
- // Set up the input slice. Its dimensions are increased in the CL kernel.
- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
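- // A (0, 0, 0) dimension makes the slice degenerate: the whole tensor is handed to the kernel, which iterates the row itself (COLS_A is baked in at compile time above).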
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- add_2D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out, lws_hint());
- }
- while(collapsed.slide_window_slice_2D(slice_out));
-}
-
-void CLGEMMLowpMatrixBReductionKernel::configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), mtx_b, vector_sum_col, info);
-}
-
-void CLGEMMLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context, const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info()));
-
- _input = mtx_b;
- _output = vector_sum_col;
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*_output->info(), TensorShape(mtx_b->info()->dimension(0)), 1, DataType::S32);
-
- auto padding_info = get_padding_info({ mtx_b, vector_sum_col });
-
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, mtx_b->info()->dimension(0));
-
- // Set the arguments to pass at compile time
- CLBuildOptions build_opts;
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mtx_b->info()->dimension(0) % num_elems_processed_per_iteration));
- build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->info()->dimension(0)));
- build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->info()->dimension(1)));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_b->info()->data_type()));
- build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_b->info()->data_type()));
- build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar));
-
- // Create kernel
- _kernel = create_kernel(compile_context, "gemmlowp_matrix_b_reduction", build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration));
- ICLKernel::configure_internal(win);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
-
- return Status{};
-}
-
-void CLGEMMLowpMatrixBReductionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY);
-
- Window slice_out = collapsed.first_slice_window_2D();
- Window slice_in = slice_out;
-
- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- add_2D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out, lws_hint());
- }
- while(collapsed.slide_window_slice_2D(slice_out));
-}
-} // namespace arm_compute
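For orientation, a minimal sketch of how these two reduction kernels are driven (hypothetical names and shapes; it assumes the CL runtime is initialised through CLScheduler and that GEMMLowpReductionKernelInfo keeps the (k, is_reshaped, scalar, mul_by_scalar) constructor from KernelDescriptors.h):

    #include "arm_compute/core/KernelDescriptors.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"

    using namespace arm_compute;

    // Row sums of a QASYMM8 matrix A stored as (x = K columns, y = M rows).
    void sum_rows_of_a(CLTensor &mtx_a, CLTensor &vector_sum_row)
    {
        const int k = static_cast<int>(mtx_a.info()->dimension(0));

        // info fields: k, is_reshaped = false, scalar = 0, mul_by_scalar = false.
        CLGEMMLowpMatrixAReductionKernel reduction;
        reduction.configure(&mtx_a, &vector_sum_row, GEMMLowpReductionKernelInfo(k, false, 0, false));

        // configure() auto-initialises vector_sum_row as an S32 vector of length M.
        vector_sum_row.allocator()->allocate();
        CLScheduler::get().enqueue(reduction);
    }

The B-side kernel is configured identically with mtx_b and vector_sum_col.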
diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.h b/src/core/CL/kernels/CLGEMMLowpReductionKernel.h
deleted file mode 100644
index 237d8099b7..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.h
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-struct GEMMLowpReductionKernelInfo;
-
-/** Common interface for all OpenCL reduction kernels */
-class ICLGEMMLowpReductionKernel : public ICLKernel
-{
-public:
- /** Constructor */
- ICLGEMMLowpReductionKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- ICLGEMMLowpReductionKernel(const ICLGEMMLowpReductionKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- ICLGEMMLowpReductionKernel &operator=(const ICLGEMMLowpReductionKernel &) = delete;
- /** Allow instances of this class to be moved */
- ICLGEMMLowpReductionKernel(ICLGEMMLowpReductionKernel &&) = default;
- /** Allow instances of this class to be moved */
- ICLGEMMLowpReductionKernel &operator=(ICLGEMMLowpReductionKernel &&) = default;
-
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
- * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
- */
- virtual void configure(const ICLTensor *input, ICLTensor *output, const GEMMLowpReductionKernelInfo &info) = 0;
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
- * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
- */
- virtual void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMLowpReductionKernelInfo &info) = 0;
-
-protected:
- const ICLTensor *_input;
- ICLTensor *_output;
-};
-
-/** OpenCL kernel used to compute the row-vector of sums of all the entries in each row of Matrix A.
- *
- * @note This stage is needed to handle the offset of matrix product (the identity is spelled out after this class):
- * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- */
-class CLGEMMLowpMatrixAReductionKernel : public ICLGEMMLowpReductionKernel
-{
-public:
- /** Initialise the kernel's input and output.
- *
- * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
- * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
- */
- void configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
- * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixAReductionKernel
- *
- * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
- * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-};
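To make the offset argument explicit (a sketch following the gemmlowp document linked above, with $a_{off}$ and $b_{off}$ the quantization offsets of A and B and $K$ the inner dimension):

    \sum_{k} (a_{ik} - a_{off})(b_{kj} - b_{off})
      = \sum_{k} a_{ik} b_{kj} - b_{off} \sum_{k} a_{ik} - a_{off} \sum_{k} b_{kj} + K \, a_{off} \, b_{off}

The second term on the right is the per-row sum produced by CLGEMMLowpMatrixAReductionKernel above; the third is the per-column sum produced by CLGEMMLowpMatrixBReductionKernel below. The offset-contribution stage that consumes these vectors applies both corrections.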
-
-/** OpenCL kernel used to compute the row-vector of sums of all the entries in each column of Matrix B.
- *
- * @note This stage is needed to handle the offset of matrix product
- * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- */
-class CLGEMMLowpMatrixBReductionKernel : public ICLGEMMLowpReductionKernel
-{
-public:
- /** Initialise the kernel's input and output.
- *
- * @param[in] mtx_b Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
- * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
- */
- void configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] mtx_b Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
- * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixBReductionKernel
- *
- * @param[in] mtx_b Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
- * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H */
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
deleted file mode 100644
index 6d3b1e5897..0000000000
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ /dev/null
@@ -1,544 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/utils/helpers/float_ops.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-using ElementsProcessed = Steps;
-
-inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float beta,
- bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((fp_mixed_precision && (input0->data_type() != DataType::F16)), "Mixed precision floating point is supported only for F16 data");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 2 && reshape_info.reinterpret_input_as_3d(), "The input1 tensor cannot have more than 2 dimensions if input0 has to be reinterpreted as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((reshape_info.reinterpret_input_as_3d() || reshape_info.depth_output_gemm3d() != 0) && (input2 != nullptr)
- && (!reshape_info.broadcast_bias()),
- "Bias addition only supported with broadcast mode in case the input or output has to be reinterpreted as 3D");
-
- if(!is_interleaved_transposed)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
-
- if(input2 != nullptr && !(helpers::float_ops::is_zero(beta)))
- {
- const unsigned int m = reshape_info.reinterpret_input_as_3d() ? input0->dimension(1) * input0->dimension(2) : input0->dimension(1);
- const unsigned int n = input1->dimension(0);
- const unsigned int input2_dim0 = input2->dimension(0);
- const unsigned int input2_dim1 = input2->dimension(1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input2, input1);
- if(reshape_info.broadcast_bias())
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim1 != 1 || input2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim0 != n || input2_dim1 != m), "Incorrect dimension of bias matrix");
- }
- }
- }
- else
- {
- GEMMRHSMatrixInfo rhs_info;
- GEMMLHSMatrixInfo lhs_info;
- const auto m = static_cast<unsigned int>(reshape_info.m());
- const auto n = static_cast<unsigned int>(reshape_info.n());
- const int k = reshape_info.k();
- const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
- const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
- rhs_info.n0 = max_cl_vector_width / input1->element_size();
- rhs_info.k0 = 1;
- rhs_info.h0 = mult_transpose1xW_width;
- rhs_info.interleave = false;
- rhs_info.transpose = false;
- lhs_info.m0 = 4;
- lhs_info.k0 = 4;
- lhs_info.v0 = mult_interleave4x4_height;
- lhs_info.interleave = true;
- lhs_info.transpose = true;
-
- TensorShape tensor_shape0{ input0->tensor_shape() };
- tensor_shape0.set(0, k);
- tensor_shape0.set(1, m);
-
- TensorShape tensor_shape1{ input1->tensor_shape() };
- tensor_shape1.set(0, n);
- tensor_shape1.set(1, k);
-
- const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
- const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
-
- const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
- const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
-
- if(input2 != nullptr && !(helpers::float_ops::is_zero(beta)))
- {
- const unsigned int input2_dim0 = input2->dimension(0);
- const unsigned int input2_dim1 = input2->dimension(1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input2, input1);
- if(reshape_info.broadcast_bias())
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim1 != 1 || input2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim0 != n || input2_dim1 != m), "Incorrect dimension of bias matrix");
- }
- }
- }
-
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
- }
-
- return Status{};
-}
-
-inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output,
- float beta, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target,
- ElementsProcessed &num_elements_processed)
-{
- ARM_COMPUTE_UNUSED(beta);
- bool window_changed = false;
- Window win{};
- Window win_out{};
-
- const DataType data_type = input0->data_type();
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d();
- bool reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
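- // (When both flags are true the operation is an ordinary batched GEMM, so clearing them lets the kernel use the simpler batched addressing; see the matching note in configure().)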
- if(reinterpret_input_as_3d == reinterpret_output_as_3d)
- {
- reinterpret_input_as_3d = false;
- reinterpret_output_as_3d = false;
- }
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info)));
-
- TensorInfo tmp_info(*output);
-
- if(reinterpret_output_as_3d)
- {
- // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(output->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- if(is_interleaved_transposed)
- {
- // reinterpret_input_as_3d is not supported if is_interleaved_transposed is set
- ARM_COMPUTE_ERROR_ON(reshape_info.reinterpret_input_as_3d());
-
- // Configure kernel window
- num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
- num_elems_processed_per_iteration_y = 4;
-
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- if(input2 != nullptr)
- {
- const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
-
- const int bias_processed_per_iteration_y = reshape_info.broadcast_bias() ? 1 : num_elems_processed_per_iteration_y;
-
- AccessWindowStatic input2_access(input2, 0, 0,
- ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),
- ceil_to_multiple(input2->dimension(1), bias_processed_per_iteration_y));
-
- window_changed = update_window_and_padding(win, input2_access); // window used by the execute_window_loop
- }
- }
- else // The input tensors have not been reshaped
- {
- // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x is set up for the default case.
- num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
- num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
-
- // Create kernels according to the architecture, data type and input size.
- GPUTarget arch_target = get_arch_from_target(gpu_target);
- if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)
- {
- num_elems_processed_per_iteration_x = (input1->dimension(0) <= 1000 && input0->num_dimensions() == 1) ? 2 : 4;
- }
-
- // Configure window
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), input0->dimension(1));
- AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));
- AccessWindowStatic output_access(output, 0, 0,
- output->dimension(0),
- output->dimension(1));
-
- if(input2 != nullptr)
- {
- const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
-
- AccessWindowStatic input2_access(input2, 0, 0,
- ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),
- input2->dimension(1));
-
- window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
- }
- else
- {
- window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
- }
-
- Coordinates coord;
- coord.set_num_dimensions(output->num_dimensions());
- output_access.set_valid_region(win_out, ValidRegion(coord, output->tensor_shape()));
- }
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
- : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _add_bias(false),
- _broadcast_bias(false)
-{
-}
-
-void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
- bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision, const ActivationLayerInfo &activation_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input0, input1, input2, output, alpha, beta, is_interleaved_transposed, reshape_info, fp_mixed_precision, activation_info);
-}
-
-void CLGEMMMatrixMultiplyKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha,
- float beta,
- bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision, const ActivationLayerInfo &activation_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
- // Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), (input2 != nullptr) ? input2->info() : nullptr, output->info(), beta,
- is_interleaved_transposed, reshape_info, fp_mixed_precision));
-
- auto padding_info = is_interleaved_transposed ? get_padding_info({ input0, input1, output }) : get_padding_info({ input0, output });
-
- _input0 = input0;
- _input1 = input1;
- _input2 = helpers::float_ops::is_zero(beta) ? nullptr : input2;
- _output = output;
- _reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d();
- _reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
- _add_bias = _input2 != nullptr;
- _broadcast_bias = reshape_info.broadcast_bias();
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
- {
- _reinterpret_input_as_3d = false;
- _reinterpret_output_as_3d = false;
- }
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_input0 = _reinterpret_input_as_3d ? _input0->info()->num_dimensions() - 1 : _input0->info()->num_dimensions();
-
- _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
- const DataType data_type = input0->info()->data_type();
-
- // Get target architecture
- GPUTarget gpu_target = get_target();
-
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), (input2 != nullptr) ? input2->info() : nullptr, output->info(), beta, is_interleaved_transposed, reshape_info,
- gpu_target, num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, both will be turned off (false)
- // in which case we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
- // This means that the actual m used by the kernel is given by output->info()->dimension(1)
- const unsigned int internal_m = _reinterpret_output_as_3d ? output->info()->dimension(1) * output->info()->dimension(2) : output->info()->dimension(1);
- const unsigned int n = output->info()->dimension(0);
-
- const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(1) : input0->info()->dimension(1);
- const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(2) : input0->info()->dimension(2);
-
- const unsigned int m0 = num_elements_processed.y();
- const unsigned int n0 = num_elements_processed.x();
-
- // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
- const unsigned int partial_store_m0 = internal_m % m0;
- const unsigned int partial_store_n0 = n % n0;
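- // Example: internal_m = 22 with m0 = 4 leaves partial_store_m0 = 2, so the last block in each column stores just 2 rows instead of forcing padded output.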
-
- // Create build options
- CLBuildOptions build_opts;
-
- build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
- build_opts.add_option_if(_input2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
- build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
- build_opts.add_option_if(reshape_info.broadcast_bias(), "-DBROADCAST_BIAS");
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
- build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
- build_opts.add_option_if(activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(activation_info.activation())));
- build_opts.add_option_if(activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(activation_info.a()));
- build_opts.add_option_if(activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(activation_info.b()));
-
- const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST;
-
- std::string kernel_name;
- if(is_interleaved_transposed)
- {
- const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
- const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
-
- build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
- build_opts.add_option("-DN=" + support::cpp11::to_string(n));
- build_opts.add_option("-DK=" + support::cpp11::to_string(input1->info()->dimension(0) / (n0 * mult_transpose1xW_width)));
- build_opts.add_option("-DH0=" + support::cpp11::to_string(mult_transpose1xW_width));
- build_opts.add_option("-DV0=" + support::cpp11::to_string(mult_interleave4x4_height));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
- build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-
- if(is_data_type_float(data_type) && is_bifrost)
- {
- kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
- }
- else
- {
- kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type));
- if(fp_mixed_precision && data_type == DataType::F16)
- {
- // currently wider accumulator is only supported for fp16 kernels.
- kernel_name += "_acc32";
- }
- }
- }
- else // The input tensors have not been reshaped
- {
- build_opts.add_option("-DN=" + support::cpp11::to_string(n));
- build_opts.add_option("-DK=" + support::cpp11::to_string(input0->info()->dimension(0)));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
- build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-
- // Create kernels according to the architecture, data type and input size.
- if(is_data_type_float(data_type) && is_bifrost)
- {
- kernel_name = "gemm_mm_floating_point";
-
- if(input0->info()->num_dimensions() != 1)
- {
- kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
- if(fp_mixed_precision && data_type == DataType::F16)
- {
- // currently wider accumulator is only supported for fp16 kernels.
- kernel_name += "_acc32";
- }
- }
- else if(input1->info()->dimension(0) <= 1000 && data_type == DataType::F32)
- {
- // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and
- // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.
- // FC6 and FC7 of AlexNet and VGG-16).
- kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost_1000";
- }
-
- // The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels
- // via exhaustive autotuning over a range of representative layer configurations.
- set_lws_hint(cl::NDRange(4));
- }
- else // (MIDGARD and F32) or (F16)
- {
- kernel_name = "gemm_mm_floating_point";
- }
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = "gemm_";
- _config_id += (is_interleaved_transposed ? "reshaped_" : "");
- _config_id += (_add_bias ? "add_bias_" : "");
- _config_id += (_broadcast_bias ? "broadcast_bias_" : "");
- _config_id += (fp_mixed_precision ? "fp_mixed_" : "");
- _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(3));
- _config_id += "_";
- _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,
- bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision, const ActivationLayerInfo &activation_info)
-{
- // Note: num_elements_processed will be set in validate_and_configure_window()
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_UNUSED(activation_info);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, input2, output, beta, is_interleaved_transposed, reshape_info, fp_mixed_precision));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
- input1->clone().get(),
- (input2 != nullptr) ? input2->clone().get() : nullptr,
- output->clone().get(),
- beta,
- is_interleaved_transposed,
- reshape_info,
- gpu_target,
- num_elements_processed)
- .first);
-
- return Status{};
-}
-
-void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- if(_input1->info()->num_dimensions() < 3)
- {
- // The stride_z for matrix B must be zero if we do not slice
- ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
- }
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- const unsigned int num_arguments_bias = _add_bias ? num_arguments_per_2D_tensor() + 1 : 0;
-
- if(_reinterpret_input_as_3d)
- {
- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + num_arguments_bias;
- const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- if(_reinterpret_output_as_3d)
- {
- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0) + num_arguments_bias;
- const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- do
- {
- Window slice_b = slice;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input0, slice);
- add_2D_tensor_argument(idx, _input1, slice_b);
- if(_add_bias)
- {
- add_2D_tensor_argument(idx, _input2, slice);
- }
- add_2D_tensor_argument(idx, _output, slice);
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
- if(_add_bias)
- {
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input2->info()->strides_in_bytes()[2]));
- }
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
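A sketch of driving the non-reshaped path (hypothetical tensor names; assumes GEMMReshapeInfo's (m, n, k) constructor from arm_compute/core/Types.h with default trailing arguments):

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"

    using namespace arm_compute;

    // dst(M, N) = alpha * lhs(M, K) * rhs(K, N); F32, no bias, inputs not reshaped.
    void simple_gemm(CLTensor &lhs, CLTensor &rhs, CLTensor &dst, float alpha)
    {
        const int m = static_cast<int>(lhs.info()->dimension(1));
        const int n = static_cast<int>(rhs.info()->dimension(0));
        const int k = static_cast<int>(lhs.info()->dimension(0));

        CLGEMMMatrixMultiplyKernel mm;
        mm.configure(&lhs, &rhs, nullptr, &dst, alpha, 0.f,
                     false /* is_interleaved_transposed */, GEMMReshapeInfo(m, n, k));

        dst.allocator()->allocate();
        CLScheduler::get().enqueue(mm);
    }

With beta left at 0, input2 is ignored (the code above nulls _input2 whenever beta is zero).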
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
deleted file mode 100644
index 71d223b8ac..0000000000
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H
-#define ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply two input matrices "A" and "B" and add a matrix "C" if provided. All elements of the output matrix will be multiplied by alpha. In case matrix C is passed, it will be added to the previous result.
- * For the matrix C, the broadcast addition is supported if the flag "broadcast_bias" is set in the GEMMReshapeInfo object
- *
- * @note If the input tensors @p input0 and @p input1 have been reshaped respectively with @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel,
- * the flag @p is_interleaved_transposed must be set to true
- *
- * @attention @p input1 tensor must have at least 2 dimensions (matrix)
- *
- */
-class CLGEMMMatrixMultiplyKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLGEMMMatrixMultiplyKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMMatrixMultiplyKernel(const CLGEMMMatrixMultiplyKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMMatrixMultiplyKernel &operator=(const CLGEMMMatrixMultiplyKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMMatrixMultiplyKernel(CLGEMMMatrixMultiplyKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMMatrixMultiplyKernel &operator=(CLGEMMMatrixMultiplyKernel &&) = default;
- /** Initialise the kernel's input, output and alpha
- *
- * @param[in] input0 Input tensor containing the Matrix A. Data types supported: F16/F32
- * @param[in] input1 Input tensor containing the Matrix B. Data type supported: same as @p input0
- * @param[in] input2 Input tensor containing the Matrix C (bias). Can be nullptr. Data type supported: same as @p input0
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta (Optional) Weight of matrix C. Default value is 0. Only beta = 1 is currently supported.
- * @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel
- * @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
- * @param[in] fp_mixed_precision (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy
- * @param[in] activation_info (Optional) Activation to apply after the matrix multiplication
- *
- */
- void configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta = 0.f,
- bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo(), bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo());
- /** Initialise the kernel's input, output and alpha
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input0 Input tensor containing the Matrix A. Data types supported: F16/F32
- * @param[in] input1 Input tensor containing the Matrix B. Data type supported: same as @p input0
- * @param[in] input2 Input tensor containing the Matrix C (bias). Can be nullptr. Data type supported: same as @p input0
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta (Optional) Weight of matrix C. Default value is 0. Only beta = 1 is currently supported.
- * @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel
- * @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
- * @param[in] fp_mixed_precision (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy
- * @param[in] activation_info (Optional) Activation to apply after the matrix multiplication
- *
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta = 0.f,
- bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo(), bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyKernel
- *
- * @param[in] input0 Input tensor containing the Matrix A info. Data types supported: F16/F32
- * @param[in] input1 Input tensor containing the Matrix B info. Data type supported: same as @p input0
- * @param[in] input2 Input tensor containing the Matrix C (bias) info. Can be nullptr. Data type supported: same as @p input0
- * @param[in] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of matrix C. Default value is 0. Only beta = 1 is currently supported.
- * @param[in] is_interleaved_transposed True if input0 and input1 have been reshaped respectively using @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel
- * @param[in] reshape_info GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
- * @param[in] gpu_target GPU Target
- * @param[in] fp_mixed_precision (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy
- * @param[in] activation_info (Optional) Activation to apply after the matrix multiplication
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,
- bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-public:
- const ICLTensor *_input0;
- const ICLTensor *_input1;
- const ICLTensor *_input2;
- ICLTensor *_output;
- bool _slide_matrix_b;
- bool _reinterpret_input_as_3d;
- bool _reinterpret_output_as_3d;
- bool _add_bias;
- bool _broadcast_bias;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H */
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp
deleted file mode 100644
index f07166e4bb..0000000000
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp
+++ /dev/null
@@ -1,422 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/utils/helpers/float_ops.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info)
-{
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
- ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (input2 != nullptr)
- && (!gemm_info.broadcast_bias),
- "Bias addition only supported with broadcast mode in case the input or output has to be reinterpreted as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for GEMM native");
-
- const unsigned int m = gemm_info.m;
- const unsigned int n = gemm_info.n;
- const unsigned int k = gemm_info.k;
-
- ARM_COMPUTE_UNUSED(m);
- ARM_COMPUTE_UNUSED(n);
- ARM_COMPUTE_UNUSED(k);
-
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != k);
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != n);
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(1) != k);
- if(gemm_info.reinterpret_input_as_3d)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != m);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != m);
- }
-
- if(input2 != nullptr && !(helpers::float_ops::is_zero(beta)))
- {
- const unsigned int input2_dim0 = input2->dimension(0);
- const unsigned int input2_dim1 = input2->dimension(1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input2, input1);
- if(gemm_info.broadcast_bias)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim1 != 1 || input2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim0 != n || input2_dim1 != m), "Incorrect dimension of bias matrix");
- }
- }
-
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
- bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
-
- Window win{};
- Window win_out{};
- bool window_changed = false;
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(reinterpret_input_as_3d == reinterpret_output_as_3d)
- {
- reinterpret_output_as_3d = false;
- }
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)));
-
- TensorInfo tmp_info(*output);
-
- if(reinterpret_output_as_3d)
- {
- // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(output->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- // Configure kernel window
- num_elems_processed_per_iteration_x = rhs_info.n0;
- num_elems_processed_per_iteration_y = lhs_info.m0;
-
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic input0_access(input0, 0, 0,
- input0->dimension(0),
- input0->dimension(1));
- AccessWindowStatic input1_access(input1, 0, 0,
- ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
- input1->dimension(1));
- AccessWindowStatic output_access(output, 0, 0,
- output->dimension(0),
- output->dimension(1));
-
- if(input2 != nullptr)
- {
- const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
-
- AccessWindowStatic input2_access(input2, 0, 0,
- ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),
- input2->dimension(1));
-
- window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
- }
- else
- {
- window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
- }
-
- output_access.set_valid_region(win_out, ValidRegion(Coordinates(), output->tensor_shape()));
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
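
For intuition about the window configured above: each work-item produces an n0 x m0 block of the output, so the dispatched grid is the output size rounded up to multiples of (n0, m0). A small sketch of that rounding, where ceil_to_multiple is a local stand-in for the library helper of the same name:

#include <cstdio>

// Local stand-in for arm_compute's ceil_to_multiple().
static unsigned int ceil_to_multiple(unsigned int value, unsigned int divisor)
{
    return ((value + divisor - 1) / divisor) * divisor;
}

int main()
{
    const unsigned int n = 17, m = 10; // output dimensions (x, y), illustrative
    const unsigned int n0 = 4, m0 = 8; // elements processed per work-item

    const unsigned int work_items_x = ceil_to_multiple(n, n0) / n0;
    const unsigned int work_items_y = ceil_to_multiple(m, m0) / m0;
    std::printf("work-items: %u x %u\n", work_items_x, work_items_y); // 5 x 2
    return 0;
}
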
-} // namespace
-
-CLGEMMMatrixMultiplyNativeKernel::CLGEMMMatrixMultiplyNativeKernel()
- : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false),
- _add_bias(false), _broadcast_bias(false)
-{
-}
-
-void CLGEMMMatrixMultiplyNativeKernel::configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input0, input1, input2, output, alpha, beta, lhs_info, rhs_info, gemm_info);
-}
-
-void CLGEMMMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha,
- float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), (input2 != nullptr ? input2->info() : nullptr), output->info(), alpha, beta, lhs_info, rhs_info, gemm_info));
-
- auto padding_info = get_padding_info({ input0, output });
- _input0 = input0;
- _input1 = input1;
- _input2 = helpers::float_ops::is_zero(beta) ? nullptr : input2;
- _output = output;
- _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
- _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
- _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
- _add_bias = _input2 != nullptr;
- _broadcast_bias = gemm_info.broadcast_bias;
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
- {
- _reinterpret_input_as_3d = false;
- _reinterpret_output_as_3d = false;
- }
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
- _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), input2 != nullptr ? input2->info() : nullptr, output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
- // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
- // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m
- const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : output->info()->dimension(1);
-
- const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(1) : input0->info()->dimension(1);
- const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(2) : input0->info()->dimension(2);
-
- // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
- const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
- const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
-
- // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
- // NOTE: This might have implications on heuristics and performance
- const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
- build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
- build_opts.add_option_if(_input2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
- build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
- build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
- build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
- build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
- build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
- build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
- build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
- build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
-
- std::string kernel_name("gemm_mm_native");
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += (_add_bias ? "add_bias_" : "");
- _config_id += (_broadcast_bias ? "broadcast_bias_" : "");
- _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : "");
- _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(gemm_info.k);
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.m0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.n0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.k0);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
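
The -DPARTIAL_STORE_M0 and -DPARTIAL_STORE_N0 defines assembled above carry the sizes of the leftover blocks at the bottom and right edges of the output (internal_m % m0 and n % n0), which lets the OpenCL kernel store a partial tile there instead of requiring the tensor to be padded. A sketch of the derivation with illustrative sizes:

#include <cstdio>
#include <string>

int main()
{
    const unsigned int internal_m = 77, n = 50; // GEMM output sizes (illustrative)
    const unsigned int m0 = 8, n0 = 4;          // block sizes per work-item

    const unsigned int partial_store_m0 = internal_m % m0; // 5: rows in the last block row
    const unsigned int partial_store_n0 = n % n0;          // 2: cols in the last block col

    const std::string opts = "-DPARTIAL_STORE_M0=" + std::to_string(partial_store_m0) +
                             " -DPARTIAL_STORE_N0=" + std::to_string(partial_store_n0);
    std::printf("%s\n", opts.c_str());
    return 0;
}
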
-
-Status CLGEMMMatrixMultiplyNativeKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, input2, output, alpha, beta, lhs_info, rhs_info, gemm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
- input1->clone().get(),
- input2 != nullptr ? input2->clone().get() : nullptr,
- output->clone().get(),
- lhs_info,
- rhs_info,
- gemm_info,
- num_elements_processed)
- .first);
-
- return Status{};
-}
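
A hedged usage sketch follows; lhs, rhs and dst are caller-provided, already-initialised CL tensors and the block sizes are illustrative rather than tuned. Since validate() runs the same checks as configure() on cloned ITensorInfo objects, it can be called without allocating anything:

// Hedged usage sketch: lhs, rhs and dst are caller-provided CL tensors whose
// infos are already initialised; block sizes are illustrative, not tuned.
void run_native_gemm_sketch(arm_compute::ICLTensor &lhs, arm_compute::ICLTensor &rhs, arm_compute::ICLTensor &dst)
{
    using namespace arm_compute;

    GEMMLHSMatrixInfo lhs_info{};
    lhs_info.m0 = 4;
    lhs_info.k0 = 4;
    GEMMRHSMatrixInfo rhs_info{};
    rhs_info.n0 = 4;
    rhs_info.k0 = 4;
    GEMMKernelInfo gemm_info{};
    gemm_info.m = dst.info()->dimension(1);
    gemm_info.n = dst.info()->dimension(0);
    gemm_info.k = lhs.info()->dimension(0);

    // validate() first: it performs the same checks on cloned infos and allocates nothing.
    if(bool(CLGEMMMatrixMultiplyNativeKernel::validate(lhs.info(), rhs.info(), nullptr, dst.info(),
                                                       1.0f, 0.0f, lhs_info, rhs_info, gemm_info)))
    {
        CLGEMMMatrixMultiplyNativeKernel kernel;
        kernel.configure(&lhs, &rhs, nullptr, &dst, 1.0f, 0.0f, lhs_info, rhs_info, gemm_info);
        CLScheduler::get().enqueue(kernel); // dispatch on the default queue
    }
}
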
-
-void CLGEMMMatrixMultiplyNativeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- if(_input1->info()->num_dimensions() < 3)
- {
- // The stride_z for matrix B must be zero if we do not slice
- ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
- }
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- if(_reinterpret_input_as_3d)
- {
- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
- unsigned int idx0;
- if(_add_bias)
- {
- idx0 = 4 * num_arguments_per_2D_tensor() + 4;
- }
- else
- {
- idx0 = 3 * num_arguments_per_2D_tensor() + 3;
- }
- const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- if(_reinterpret_output_as_3d)
- {
- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- unsigned int idx0;
- if(_add_bias)
- {
- idx0 = 4 * num_arguments_per_2D_tensor() + 4 + (_reinterpret_input_as_3d ? 1 : 0);
- }
- else
- {
- idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
- }
- const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- do
- {
- Window slice_b = slice;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input0, slice);
- add_2D_tensor_argument(idx, _input1, slice_b);
- if(_add_bias)
- {
- add_2D_tensor_argument(idx, _input2, slice);
- }
- add_2D_tensor_argument(idx, _output, slice);
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
- if(_add_bias)
- {
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input2->info()->strides_in_bytes()[2]));
- }
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
- enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
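
The idx0 arithmetic in run() simply skips past the arguments set inside the slice loop: each 2D tensor contributes num_arguments_per_2D_tensor() kernel arguments plus one cl_uint stride_z, so the cross-plane padding lands immediately after them, with the output's pad following the input's when both are passed. A sketch of the same bookkeeping, where A stands in for num_arguments_per_2D_tensor() (its value is an ICLKernel implementation detail; the 6 used below is purely illustrative):

#include <cstdio>

// A = arguments added per 2D tensor (ICLKernel detail, exact value unimportant here).
static unsigned int pad_arg_index(unsigned int A, bool add_bias, bool input_is_3d)
{
    const unsigned int tensors = add_bias ? 4u : 3u; // input0, input1, [input2], output
    unsigned int idx = tensors * A + tensors;        // tensor args + one stride_z each
    // The output's cross-plane pad comes after the input's, if both are passed.
    return idx + (input_is_3d ? 1u : 0u);
}

int main()
{
    std::printf("no bias, input pad at arg %u\n", pad_arg_index(6, false, false));
    std::printf("bias, output pad at arg %u\n", pad_arg_index(6, true, true));
    return 0;
}
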
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h b/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h
deleted file mode 100644
index 6b6004b464..0000000000
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
-#define ARM_COMPUTE_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices when neither of the input matrices has been reshaped */
-class CLGEMMMatrixMultiplyNativeKernel : public ICLKernel
-{
-public:
- /** Default Constructor */
- CLGEMMMatrixMultiplyNativeKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMMatrixMultiplyNativeKernel(const CLGEMMMatrixMultiplyNativeKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMMatrixMultiplyNativeKernel &operator=(const CLGEMMMatrixMultiplyNativeKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMMatrixMultiplyNativeKernel(CLGEMMMatrixMultiplyNativeKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMMatrixMultiplyNativeKernel &operator=(CLGEMMMatrixMultiplyNativeKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input0 Input tensor for the LHS matrix. Data type supported: F32. The number of dimensions for the LHS matrix must be less than or equal to 4.
- * @param[in] input1 Input tensor for the RHS matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
- * @param[in] input2 Input tensor containing the bias matrix. Data type supported: same as @p input0.
- * @param[out] output Output tensor. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of the matrix bias
- * @param[in] lhs_info LHS matrix information used to retrieve the number of rows and accumulations to be processed by each thread. Only the following values are supported:
- * lhs_info.m0: 1,2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * @param[in] rhs_info RHS matrix information used to retrieve the number of columns and accumulations to be processed by each thread. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- */
- void configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input0 Input tensor for the LHS matrix. Data type supported: F32. The number of dimensions for the LHS matrix must be less than or equal to 4.
- * @param[in] input1 Input tensor for the RHS matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
- * @param[in] input2 Input tensor containing the bias matrix. Data type supported: same as @p input0.
- * @param[out] output Output tensor. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of the matrix bias
- * @param[in] lhs_info LHS matrix information used to retrieve the number of rows and accumulations to be processed by each thread. Only the following values are supported:
- * lhs_info.m0: 1,2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * @param[in] rhs_info RHS matrix information used to retrieve the number of columns and accumulations to be processed by each thread. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyNativeKernel
- *
- * @param[in] input0 Input tensor info for the LHS matrix. Data type supported: F32. The number of dimensions for the LHS matrix must be less than or equal to 4.
- * @param[in] input1 Input tensor info for the RHS matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
- * @param[in] input2 Input tensor info containing the bias matrix. Data type supported: same as @p input0.
- * @param[in] output Output tensor info. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of the matrix bias
- * @param[in] lhs_info LHS matrix information used to retrieve the number of rows and accumulations to be processed by each thread. Only the following values are supported:
- * lhs_info.m0: 1,2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * @param[in] rhs_info RHS matrix information used to retrieve the number of columns and accumulations to be processed by each thread. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input0;
- const ICLTensor *_input1;
- const ICLTensor *_input2;
- ICLTensor *_output;
- bool _slide_matrix_b;
- bool _reinterpret_input_as_3d;
- bool _reinterpret_output_as_3d;
- bool _use_dummy_work_items;
- bool _add_bias;
- bool _broadcast_bias;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H*/
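
As the parameter notes above imply, the bias accepted by this kernel has one of two shapes: an n x 1 vector when gemm_info.broadcast_bias is set, added to every row of the output, or a full n x m matrix otherwise. A plain scalar sketch of the two modes, independent of the library:

#include <cstdio>
#include <vector>

// dst[m x n] += beta * bias, either broadcasting a length-n vector across rows
// (broadcast == true) or adding a full m x n matrix element-wise.
static void add_bias(std::vector<float> &dst, const std::vector<float> &bias,
                     unsigned int m, unsigned int n, float beta, bool broadcast)
{
    for(unsigned int y = 0; y < m; ++y)
    {
        for(unsigned int x = 0; x < n; ++x)
        {
            dst[y * n + x] += beta * (broadcast ? bias[x] : bias[y * n + x]);
        }
    }
}

int main()
{
    std::vector<float> dst(2 * 3, 1.0f);
    std::vector<float> bias = { 10.f, 20.f, 30.f }; // n == 3, broadcast mode
    add_bias(dst, bias, 2, 3, 1.0f, true);
    std::printf("dst[0]=%g dst[5]=%g\n", dst[0], dst[5]); // 11 and 31
    return 0;
}
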
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
deleted file mode 100644
index 9f1ffa48eb..0000000000
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
+++ /dev/null
@@ -1,443 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLUtils.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/CL/gemm/CLGEMMHelpers.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/utils/helpers/float_ops.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-class Coordinates;
-} // namespace arm_compute
-
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info)
-{
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose == rhs_info.transpose);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_info.transpose) && ((lhs_info.m0 & (lhs_info.m0 - 1)) && lhs_info.m0 != 3), "Only 2,3,4,8,16 are supported for m0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.transpose) && ((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (input2 != nullptr)
- && (!gemm_info.broadcast_bias),
- "Bias addition only supported with broadcast mode in case the input or output has to be reinterpreted as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (input0->data_type() == DataType::F32), "Mixed precision only supported for F16 data type");
- ARM_COMPUTE_RETURN_ON_ERROR(cl_gemm::validate_image2d_support_on_rhs(*input1, rhs_info));
-
- const unsigned int m = gemm_info.m;
- const unsigned int n = gemm_info.n;
- const unsigned int k = gemm_info.k;
-
- TensorShape tensor_shape0{ input0->tensor_shape() };
- tensor_shape0.set(0, k);
- tensor_shape0.set(1, m);
-
- TensorShape tensor_shape1{ input1->tensor_shape() };
- tensor_shape1.set(0, n);
- tensor_shape1.set(1, k);
-
- if(input2 != nullptr && !(helpers::float_ops::is_zero(beta)))
- {
- const unsigned int input2_dim0 = input2->dimension(0);
- const unsigned int input2_dim1 = input2->dimension(1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input2, input1);
- if(gemm_info.broadcast_bias)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim1 != 1 || input2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim0 != n || input2_dim1 != m), "Incorrect dimension of bias matrix");
- }
- }
-
- const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
- const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
-
- const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
- const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
-
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
-
- Window win{};
- Window win_out{};
- bool window_changed = false;
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)));
-
- TensorInfo tmp_info(*output);
-
- if(reinterpret_output_as_3d)
- {
- // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(output->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- // Configure kernel window
- num_elems_processed_per_iteration_x = rhs_info.n0;
- num_elems_processed_per_iteration_y = lhs_info.m0;
-
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic input0_access(input0, 0, 0,
- input0->dimension(0),
- input0->dimension(1));
- AccessWindowStatic input1_access(input1, 0, 0,
- input1->dimension(0),
- input1->dimension(1));
- AccessWindowStatic output_access(output, 0, 0,
- output->dimension(0),
- output->dimension(1));
-
- if(input2 != nullptr)
- {
- const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
-
- const int bias_processed_per_iteration_y = gemm_info.broadcast_bias ? 1 : num_elems_processed_per_iteration_y;
-
- AccessWindowStatic input2_access(input2, 0, 0,
- ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),
- ceil_to_multiple(input2->dimension(1), bias_processed_per_iteration_y));
-
- window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
- }
- else
- {
- window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
- }
-
- output_access.set_valid_region(win_out, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
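
The reshaped variant expects pre-blocked inputs: conceptually the LHS is split into m0 x k0 tiles and the RHS into k0 x n0 tiles (with v0/h0 controlling how tiles are interleaved), which is what compute_lhs_reshaped_shape() and compute_rhs_reshaped_shape() above account for when reconstructing the original shapes. A toy, library-independent illustration of tiling a row-major matrix into contiguous tiles, ignoring the interleave and transpose options:

#include <cstdio>
#include <vector>

// Copy an (m x k) row-major matrix into contiguous m0 x k0 tiles, tile row by
// tile row -- a simplified picture of what the LHS reshape produces.
// Assumes m and k are multiples of m0 and k0 for brevity.
static std::vector<float> tile_lhs(const std::vector<float> &src,
                                   unsigned int m, unsigned int k,
                                   unsigned int m0, unsigned int k0)
{
    std::vector<float> dst;
    dst.reserve(src.size());
    for(unsigned int ty = 0; ty < m; ty += m0)
    {
        for(unsigned int tx = 0; tx < k; tx += k0)
        {
            for(unsigned int y = ty; y < ty + m0; ++y)
            {
                for(unsigned int x = tx; x < tx + k0; ++x)
                {
                    dst.push_back(src[y * k + x]);
                }
            }
        }
    }
    return dst;
}

int main()
{
    // 4x4 matrix, 2x2 tiles: elements 0,1,4,5 form the first tile, and so on.
    std::vector<float> src(16);
    for(unsigned int i = 0; i < 16; ++i) { src[i] = float(i); }
    const auto t = tile_lhs(src, 4, 4, 2, 2);
    std::printf("%g %g %g %g\n", t[0], t[1], t[2], t[3]); // 0 1 4 5
    return 0;
}
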
-
-CLGEMMMatrixMultiplyReshapedKernel::CLGEMMMatrixMultiplyReshapedKernel()
- : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _use_dummy_work_items(false), _add_bias(false),
- _broadcast_bias(false), _export_to_cl_image(false), _k(1)
-{
-}
-
-void CLGEMMMatrixMultiplyReshapedKernel::configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input0, input1, input2, output, alpha, beta, lhs_info, rhs_info, gemm_info);
-}
-
-void CLGEMMMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha,
- float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), (input2 != nullptr ? input2->info() : nullptr), output->info(), alpha, beta, lhs_info, rhs_info, gemm_info));
-
- auto padding_info = get_padding_info({ input0, output });
- _input0 = input0;
- _input1 = input1;
- _input2 = helpers::float_ops::is_zero(beta) ? nullptr : input2;
- _output = output;
- _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
- _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
- _add_bias = _input2 != nullptr;
- _broadcast_bias = gemm_info.broadcast_bias;
- _export_to_cl_image = rhs_info.export_to_cl_image;
- _k = gemm_info.k;
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
- _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), input2 != nullptr ? input2->info() : nullptr, output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- const bool enable_mixed_precision = gemm_info.fp_mixed_precision;
- const DataType data_type = input0->info()->data_type();
-
- // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
- const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : output->info()->dimension(1);
-
- const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
- const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
- build_opts.add_option_if(_input2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
- build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
- build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
- build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
- build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
- build_opts.add_option_if(lhs_info.transpose, "-DLHS_TRANSPOSE");
- build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
- build_opts.add_option_if(enable_mixed_precision, "-DMIXED_PRECISION");
- build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT");
- build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(input1->info()->dimension(1)));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
- build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision ? get_cl_type_from_data_type(DataType::F32) : get_cl_type_from_data_type(data_type)));
- build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m));
- build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
- build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
- build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
- build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
- build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-
- std::string kernel_name("gemm_mm_reshaped_");
- kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
- kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt";
- kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += (_add_bias ? "add_bias_" : "");
- _config_id += (_broadcast_bias ? "broadcast_bias_" : "");
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : "");
- _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
- _config_id += "_";
- _config_id += (enable_mixed_precision ? "mixed_precision_" : "");
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(gemm_info.k);
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.m0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.n0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.k0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.v0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.h0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.interleave);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.interleave);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
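
The kernel name assembled above selects among the gemm_mm_reshaped_* variants; since validate_arguments() requires the two transpose flags to differ, only the lhs_nt_rhs_t and lhs_t_rhs_nt families exist, each with an optional _texture suffix for the OpenCL-image path. The same selection logic in isolation:

#include <cstdio>
#include <string>

static std::string reshaped_kernel_name(bool lhs_t, bool rhs_t, bool texture)
{
    std::string name("gemm_mm_reshaped_");
    name += lhs_t ? "lhs_t_" : "lhs_nt_";
    name += rhs_t ? "rhs_t" : "rhs_nt";
    name += texture ? "_texture" : "";
    return name;
}

int main()
{
    std::printf("%s\n", reshaped_kernel_name(false, true, true).c_str());
    // -> gemm_mm_reshaped_lhs_nt_rhs_t_texture
    return 0;
}
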
-
-Status CLGEMMMatrixMultiplyReshapedKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, input2, output, alpha, beta, lhs_info, rhs_info, gemm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
- input1->clone().get(),
- input2 != nullptr ? input2->clone().get() : nullptr,
- output->clone().get(),
- lhs_info,
- rhs_info,
- gemm_info,
- num_elements_processed)
- .first);
-
- return Status{};
-}
-
-void CLGEMMMatrixMultiplyReshapedKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- if(_input1->info()->num_dimensions() < 3)
- {
- // The stride_z for matrix B must be zero if we do not slice
- ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
- }
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
-
- cl::Image2D input1_image2d;
-
- if(_export_to_cl_image)
- {
- const TensorShape shape2d(_input1->info()->dimension(0) / 4, _input1->info()->dimension(1) * _input1->info()->dimension(2));
- const size_t image_row_pitch = _input1->info()->strides_in_bytes()[1];
-
- input1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input1->cl_buffer(), shape2d, _input1->info()->data_type(), image_row_pitch);
- }
-
- do
- {
- Window slice_b = slice;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
-
- // LHS buffer
- add_2D_tensor_argument(idx, _input0, slice);
-
- // RHS buffer or RHS OpenCL image (_export_to_cl_image == true)
- if(_export_to_cl_image)
- {
- _kernel.setArg(idx++, input1_image2d);
- }
- else
- {
- add_2D_tensor_argument(idx, _input1, slice_b);
- }
-
- // Bias buffer (_add_bias == true)
- add_2D_tensor_argument_if(_add_bias, idx, _input2, slice);
-
- // Output buffer
- add_2D_tensor_argument(idx, _output, slice);
-
- // K dimension (not used if _export_to_cl_image == true)
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k));
-
- // LHS stride_z
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
-
- // RHS stride_z (not used if _export_to_cl_image == true)
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
-
- // Bias stride_z (if _add_bias == true)
- if(_add_bias)
- {
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input2->info()->strides_in_bytes()[2]));
- }
-
- // Output stride_z
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
-
- // Cross-plane padding (if _reinterpret_output_as_3d == true)
- if(_reinterpret_output_as_3d)
- {
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- // Dispatch kernel
- enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
- }
- while(window.slide_window_slice_3D(slice));
-}
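
The shape2d computed in run() divides the RHS width by 4 because each texel of the imported image packs four 32-bit floats, and it folds dimensions 1 and 2 into the image height because a 2D image has no depth. The arithmetic on its own, under the same packing assumption:

#include <cstdio>

int main()
{
    // Illustrative RHS buffer dimensions (in elements): width, height, depth.
    const unsigned int dim0 = 64, dim1 = 24, dim2 = 3;

    const unsigned int image_width  = dim0 / 4;    // 4 F32 elements per texel
    const unsigned int image_height = dim1 * dim2; // fold depth into rows

    std::printf("image2d: %u x %u texels\n", image_width, image_height);
    return 0;
}
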
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h
deleted file mode 100644
index 2ffc322def..0000000000
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
-#define ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices when both the input matrices LHS (input0) and RHS (input1) have been reshaped
- *
- * @note The input matrices @p input0 and @p input1 must be reshaped through @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel
- */
-class CLGEMMMatrixMultiplyReshapedKernel : public ICLKernel
-{
-public:
- /** Default Constructor */
- CLGEMMMatrixMultiplyReshapedKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMMatrixMultiplyReshapedKernel(const CLGEMMMatrixMultiplyReshapedKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMMatrixMultiplyReshapedKernel &operator=(const CLGEMMMatrixMultiplyReshapedKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMMatrixMultiplyReshapedKernel(CLGEMMMatrixMultiplyReshapedKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMMatrixMultiplyReshapedKernel &operator=(CLGEMMMatrixMultiplyReshapedKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @note The F16 computation also supports mixed precision through the gemm_info.fp_mixed_precision flag.
- * Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the
- * multiplications. i.e. float c = (half)a * (half)b
- *
- * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
- * Reading from the OpenCL image object can increase performance. However, since the OpenCL image object is created by importing the OpenCL buffer,
- * the following conditions are required:
- * -# rhs_info.n0 can only be 4, 8 and 16
- * -# rhs_info.k0 can only be 4, 8 and 16
- * -# Data type can only be F32
- * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
- * -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
- * -# input1 width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
- * -# input1 (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
- *
- * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less than or equal to 4
- * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3
- * @param[in] input2 Input tensor containing the bias matrix. Data type supported: same as @p input0.
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of the matrix bias
- * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * lhs_info.transpose: false
- * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
- * rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
- * rhs_info.transpose: true
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @note lhs_info.k0 must be equal to rhs_info.k0
- */
- void configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info);
- /** Initialise the kernel's input and output.
- *
- * @note The F16 computation also supports mixed precision through the gemm_info.fp_mixed_precision flag.
- * Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the
- * multiplications. i.e. float c = (half)a * (half)b
- *
- * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
- * Reading from the OpenCL image object can increase performance. However, since the OpenCL image object is created by importing the OpenCL buffer,
- * the following conditions are required:
- * -# rhs_info.n0 can only be 4, 8 and 16
- * -# rhs_info.k0 can only be 4, 8 and 16
- * -# Data type can only be F32
- * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
- * -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
- * -# input1 width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
- * -# input1 (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less than or equal to 4
- * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3
- * @param[in] input2 Input tensor containing the bias matrix. Data type supported: same as @p input0.
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of the matrix bias
- * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * lhs_info.transpose: false
- * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
- * rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
- * rhs_info.transpose: true
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @note lhs_info.k0 must be equal to rhs_info.k0
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyReshapedKernel
- *
- * @note The F16 computation also supports mixed precision through the gemm_info.fp_mixed_precision flag.
- * Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the
- * multiplications. i.e. float c = (half)a * (half)b
- *
- * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
- * Reading from the OpenCL image object can increase performance. However, since the OpenCL image object is created by importing the OpenCL buffer,
- * the following conditions are required:
- * -# rhs_info.n0 can only be 4, 8 and 16
- * -# rhs_info.k0 can only be 4, 8 and 16
- * -# Data type can only be F32
- * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
- * -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
- * -# input1 width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
- * -# input1 (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
- *
- * @param[in] input0 Input tensor info containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less than or equal to 4
- * @param[in] input1 Input tensor info containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3
- * @param[in] input2 Input tensor info containing the bias matrix. Data type supported: same as @p input0.
- * @param[in] output Output tensor info to store the result of matrix multiplication. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of the matrix bias
- * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * lhs_info.transpose: false
- * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
- * rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
- * rhs_info.transpose: true
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @note lhs_info.k0 must be equal to rhs_info.k0
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input0;
- const ICLTensor *_input1;
- const ICLTensor *_input2;
- ICLTensor *_output;
- bool _slide_matrix_b;
- bool _reinterpret_output_as_3d;
- bool _use_dummy_work_items;
- bool _add_bias;
- bool _broadcast_bias;
- bool _export_to_cl_image;
- unsigned int _k;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H*/ \ No newline at end of file
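
The export_to_cl_image preconditions listed above are mechanical enough to check up front. A hedged sketch of such a check; in real code the device limits and pitch alignment would come from clGetDeviceInfo, whereas here they are plain parameters:

#include <cstddef>

// Simplified mirror of the documented constraints for importing the RHS
// buffer as an OpenCL image. Real code queries the device for the limits.
static bool can_export_rhs_to_cl_image(unsigned int n0, unsigned int k0, bool is_f32,
                                       bool has_image2d_from_buffer,
                                       std::size_t row_pitch_bytes, std::size_t pitch_alignment,
                                       std::size_t width, std::size_t height_x_depth,
                                       std::size_t max_image_width, std::size_t max_image_height)
{
    const bool n0_ok = (n0 == 4) || (n0 == 8) || (n0 == 16);
    const bool k0_ok = (k0 == 4) || (k0 == 8) || (k0 == 16);
    return n0_ok && k0_ok && is_f32 && has_image2d_from_buffer &&
           (row_pitch_bytes % pitch_alignment == 0) &&
           (width <= max_image_width * 4) && (height_x_depth <= max_image_height);
}

int main()
{
    // Illustrative values only: n0, k0, F32, extension present, pitch, alignment,
    // width, height * depth, and the two device limits.
    return can_export_rhs_to_cl_image(4, 4, true, true, 1024, 64, 4096, 1024, 16384, 16384) ? 0 : 1;
}
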
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
deleted file mode 100644
index 3dee4f24cd..0000000000
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
+++ /dev/null
@@ -1,449 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLUtils.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/CL/gemm/CLGEMMHelpers.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/utils/helpers/float_ops.h"
-#include "support/StringSupport.h"
-
-#include <tuple>
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info)
-{
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_info.m0 < 1 || lhs_info.m0 > 8, "Only 1,2,3,4,5,6,7,8 are supported for m0");
- ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16 || rhs_info.k0 < 2);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
- ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16 || rhs_info.n0 < 2);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (input2 != nullptr)
- && (!gemm_info.broadcast_bias),
- "Bias addition only supported with broadcast mode in case the input or output has to be reinterpreted as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
- ARM_COMPUTE_RETURN_ON_ERROR(cl_gemm::validate_image2d_support_on_rhs(*input1, rhs_info));
-
- const unsigned int m = gemm_info.m;
- const unsigned int n = gemm_info.n;
- const unsigned int k = gemm_info.k;
-
- TensorShape tensor_shape1{ input1->tensor_shape() };
- tensor_shape1.set(0, n);
- tensor_shape1.set(1, k);
-
- if(input2 != nullptr && !(helpers::float_ops::is_zero(beta)))
- {
- const unsigned int input2_dim0 = input2->dimension(0);
- const unsigned int input2_dim1 = input2->dimension(1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input2, input0);
- if(gemm_info.broadcast_bias)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim1 != 1 || input2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim0 != n || input2_dim1 != m), "Incorrect dimension of bias matrix");
- }
- }
-
- const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
-
- const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
-
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != k);
- if(gemm_info.reinterpret_input_as_3d)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != m);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != m);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
-
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
- }
-
- return Status{};
-}
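The k0/n0 guards in validate_arguments() above encode "2,3,4,8,16" with a bit trick: x & (x - 1) is zero exactly when x is a power of two, so the checks reject anything that is neither a power of two nor 3, then bound the range separately. A minimal standalone sketch of that predicate (the helper name is ours, for illustration only):

    #include <cassert>

    // Accepts exactly 2,3,4,8,16 -- the values the kernel supports for k0/n0.
    static bool is_supported_block_size(unsigned int x)
    {
        const bool power_of_two = (x & (x - 1)) == 0; // true for 1,2,4,8,16,...
        return (power_of_two || x == 3) && x >= 2 && x <= 16;
    }

    int main()
    {
        assert(is_supported_block_size(3) && is_supported_block_size(16));
        assert(!is_supported_block_size(5) && !is_supported_block_size(32));
        return 0;
    }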
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
- bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
-
- Window win{};
- Window win_out{};
- bool window_changed = false;
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- // This approach should only be used when the input/output tensors have padding in the y direction
- if((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y)
- {
- reinterpret_output_as_3d = false;
- }
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)));
-
- TensorInfo tmp_info(*output);
-
- if(reinterpret_output_as_3d)
- {
- // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(output->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- // Configure kernel window
- num_elems_processed_per_iteration_x = rhs_info.n0;
- num_elems_processed_per_iteration_y = lhs_info.m0;
-
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- if(input2 != nullptr)
- {
- const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
-
- AccessWindowStatic input2_access(input2, 0, 0,
- ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),
- input2->dimension(1));
-
- window_changed = update_window_and_padding(win, input2_access);
- }
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
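When a bias tensor is present, the access window above widens the x extent to a multiple of the elements processed per iteration. A short sketch of what a ceil_to_multiple-style rounding computes for positive integers (our own re-implementation for illustration, not the library's definition):

    #include <cstdio>

    // Round value up to the nearest multiple of 'multiple' (assumes multiple > 0).
    static unsigned int ceil_to_multiple_sketch(unsigned int value, unsigned int multiple)
    {
        return ((value + multiple - 1) / multiple) * multiple;
    }

    int main()
    {
        // e.g. a 30-element bias row processed 8 elements at a time is widened to 32
        std::printf("%u\n", ceil_to_multiple_sketch(30, 8)); // prints 32
        return 0;
    }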
-
-CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::CLGEMMMatrixMultiplyReshapedOnlyRHSKernel()
- : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false),
- _add_bias(false), _broadcast_bias(false), _export_to_cl_image(false), _has_pad_y(false)
-{
-}
-
-void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input0, input1, input2, output, alpha, beta, lhs_info, rhs_info, gemm_info);
-}
-
-void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output,
- float alpha,
- float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), (input2 != nullptr ? input2->info() : nullptr), output->info(), alpha, beta, lhs_info, rhs_info, gemm_info));
-
- _input0 = input0;
- _input1 = input1;
- _input2 = helpers::float_ops::is_zero(beta) ? nullptr : input2;
- _output = output;
- _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
- _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
- _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
- _add_bias = _input2 != nullptr;
- _broadcast_bias = gemm_info.broadcast_bias;
- _export_to_cl_image = rhs_info.export_to_cl_image;
- _has_pad_y = gemm_info.has_pad_y;
-
- auto padding_info = get_padding_info({ input0, input1, output });
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y)
- {
- _reinterpret_input_as_3d = false;
- _reinterpret_output_as_3d = false;
- }
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
- _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), input2 != nullptr ? input2->info() : nullptr, output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
- // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
- // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m
- const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : output->info()->dimension(1);
-
- // These variables are used only if gemm_info.has_pad_y == true
- const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(1) : input0->info()->dimension(1);
- const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(2) : input0->info()->dimension(2);
-
- // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
- // NOTE: This might have implications on heuristics and performance
- const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
-
- // Calculate the partial (store instead of load) M0 and N0 for the partial blocks at the end of a row/column, if any. This avoids having to pad the output.
- const unsigned int partial_store_m0 = internal_m % internal_m0;
- const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
- build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
- build_opts.add_option_if(_input2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
- build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
- build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
- build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
- build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
- build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
- build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT");
- build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(input1->info()->dimension(1)));
- build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
- build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
- build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
- build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
- build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
- if(_has_pad_y)
- {
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
- }
-
- std::string kernel_name("gemm_mm_reshaped_only_rhs_");
- kernel_name += rhs_info.transpose ? "t" : "nt";
- kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += (_has_pad_y ? "" : "no_pad_y_");
- _config_id += (_add_bias ? "add_bias_" : "");
- _config_id += (_broadcast_bias ? "broadcast_bias_" : "");
- _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : "");
- _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(gemm_info.k);
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.m0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.n0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.k0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.h0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.interleave);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
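To make the M0 shrinking and the partial-store arithmetic in configure() concrete, here is a standalone computation under assumed sizes (M = 23, N = 30, lhs_info.m0 = 8, rhs_info.n0 = 4); the results are the values that would be passed as -DM0, -DPARTIAL_STORE_M0 and -DPARTIAL_STORE_N0:

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        const unsigned int m = 23, n = 30; // assumed internal_m and gemm_info.n
        const unsigned int m0 = 8, n0 = 4; // assumed block sizes

        const unsigned int internal_m0      = std::min(m, m0); // 8 (m0 already <= M)
        const unsigned int partial_store_m0 = m % internal_m0; // 23 % 8 = 7
        const unsigned int partial_store_n0 = n % n0;          // 30 % 4 = 2

        std::printf("M0=%u PARTIAL_STORE_M0=%u PARTIAL_STORE_N0=%u\n",
                    internal_m0, partial_store_m0, partial_store_n0);
        return 0;
    }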
-
-Status CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, input2, output, alpha, beta, lhs_info, rhs_info, gemm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
- input1->clone().get(),
- input2 != nullptr ? input2->clone().get() : nullptr,
- output->clone().get(),
- lhs_info,
- rhs_info,
- gemm_info,
- num_elements_processed)
- .first);
-
- return Status{};
-}
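A typical caller runs the static validate() before configure() so that an unsupported configuration surfaces as a Status rather than an assertion at configure time. A hedged sketch of that flow (the helper below is hypothetical; tensor setup is elided):

    #include "arm_compute/core/CL/ICLTensor.h"
    #include "arm_compute/core/Error.h"
    #include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"

    using namespace arm_compute;

    // Hypothetical helper: validate first, configure only on success.
    bool try_configure(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &kernel,
                       const ICLTensor *lhs, const ICLTensor *rhs_reshaped, const ICLTensor *bias,
                       ICLTensor *dst, float alpha, float beta,
                       const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
                       const GEMMKernelInfo &gemm_info)
    {
        const Status st = CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(
            lhs->info(), rhs_reshaped->info(), bias != nullptr ? bias->info() : nullptr,
            dst->info(), alpha, beta, lhs_info, rhs_info, gemm_info);
        if(st.error_code() != ErrorCode::OK)
        {
            return false; // st.error_description() carries the reason
        }
        kernel.configure(lhs, rhs_reshaped, bias, dst, alpha, beta, lhs_info, rhs_info, gemm_info);
        return true;
    }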
-
-void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- if(_input1->info()->num_dimensions() < 3)
- {
- // The stride_z for matrix B must be zero if we do not slice
- ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
- }
-
- const size_t lhs_idx_batch_size = _reinterpret_input_as_3d && !_has_pad_y ? 3u : 2u;
- const size_t rhs_idx_batch_size = 2u;
- const size_t bia_idx_batch_size = 2u;
- const size_t out_idx_batch_size = _reinterpret_output_as_3d && !_has_pad_y ? 3u : 2u;
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- // Get cross plane pads
- const unsigned int total_cross_plane_pad_lhs = _input0->info()->padding().top + _input0->info()->padding().bottom;
- const unsigned int total_cross_plane_pad_out = _output->info()->padding().top + _output->info()->padding().bottom;
-
- // The execution should fail if we try to run with has_pad_y = false but we have padding in either the LHS or DST tensor
- ARM_COMPUTE_ERROR_ON(!_has_pad_y && ((total_cross_plane_pad_lhs != 0) || (total_cross_plane_pad_out != 0)));
-
- cl::Image2D input1_image2d;
-
- if(_export_to_cl_image)
- {
- const TensorShape shape2d(_input1->info()->dimension(0) / 4, _input1->info()->dimension(1) * _input1->info()->dimension(2));
- const size_t image_row_pitch = _input1->info()->strides_in_bytes()[1];
-
- input1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input1->cl_buffer(), shape2d, _input1->info()->data_type(), image_row_pitch);
- }
-
- do
- {
- Window slice_b = slice;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A has more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
-
- // LHS buffer
- add_2D_tensor_argument(idx, _input0, slice);
-
- // RHS buffer or RHS OpenCL image (_export_to_cl_image == true)
- if(_export_to_cl_image)
- {
- _kernel.setArg(idx++, input1_image2d);
- }
- else
- {
- add_2D_tensor_argument(idx, _input1, slice_b);
- }
-
- // Bias buffer (_add_bias == true)
- add_2D_tensor_argument_if(_add_bias, idx, _input2, slice);
-
- // Output buffer
- add_2D_tensor_argument(idx, _output, slice);
-
- // LHS stride_z
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[lhs_idx_batch_size]));
-
- // RHS stride_z (not used if _export_to_cl_image == true)
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[rhs_idx_batch_size]));
-
- // Bias stride_z (if _add_bias == true)
- if(_add_bias)
- {
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input2->info()->strides_in_bytes()[bia_idx_batch_size]));
- }
-
- // Output stride_z
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[out_idx_batch_size]));
-
- // Cross-plane padding (if _reinterpret_input_as_3d = true)
- if(_reinterpret_input_as_3d && _has_pad_y)
- {
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_lhs));
- }
-
- // Cross-plane padding (if _reinterpret_output_as_3d = true)
- if(_reinterpret_output_as_3d && _has_pad_y)
- {
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_out));
- }
-
- enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
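On the cl_image path in run(), each 2D image texel packs four elements, which is why shape2d uses dimension(0) / 4 for the width while the row pitch comes from the byte strides. A standalone sketch of that shape computation under assumed reshaped-RHS dimensions (F32, no padding along x):

    #include <cstddef>
    #include <cstdio>

    int main()
    {
        const std::size_t dim0 = 64, dim1 = 128, dim2 = 2; // assumed extents, in elements
        const std::size_t element_size = 4;                // bytes per F32 element

        const std::size_t image_width  = dim0 / 4;            // 4 elements per texel -> 16
        const std::size_t image_height = dim1 * dim2;         // rows across batches -> 256
        const std::size_t row_pitch    = dim0 * element_size; // assumes no x padding

        std::printf("image2d: %zu x %zu, row pitch %zu bytes\n", image_width, image_height, row_pitch);
        return 0;
    }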
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h
deleted file mode 100644
index 5b96679a46..0000000000
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
-#define ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices when only the input matrix RHS (input1) has been reshaped
- *
- * @note The input matrix input1 must be reshaped through @ref CLGEMMReshapeRHSMatrixKernel
- */
-class CLGEMMMatrixMultiplyReshapedOnlyRHSKernel : public ICLKernel
-{
-public:
- /** Default Constructor */
- CLGEMMMatrixMultiplyReshapedOnlyRHSKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMMatrixMultiplyReshapedOnlyRHSKernel(const CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &operator=(const CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMMatrixMultiplyReshapedOnlyRHSKernel(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &operator=(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
- *       Reading from the OpenCL image object can increase performance. However, since the OpenCL image object is created by importing the OpenCL buffer,
- * the following conditions are required:
- * -# rhs_info.n0 can only be 4, 8 and 16
- * -# rhs_info.k0 can only be 4, 8 and 16
- * -# Data type can only be F32
- * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
- *       -# The stride Y for input1 should satisfy the OpenCL pitch alignment requirement
- *       -# input1 width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
- *       -# input1 (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
- *
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true).
- *                             The number of dimensions for the LHS matrix must be less than or equal to 4.
- * @param[in]  input1          Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
- * @param[in] input2 Input tensor containing the bias matrix. Data type supported: same as @p input0.
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of the matrix bias
- * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread. Only the following values are supported:
- * lhs_info.m0: 1,2,3,4,5,6,7,8
- * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported:
- * rhs_info.k0: 2,3,4,8,16
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.transpose: true,false
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- */
- void configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info);
- /** Initialise the kernel's input and output.
- *
- * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
- *       Reading from the OpenCL image object can increase performance. However, since the OpenCL image object is created by importing the OpenCL buffer,
- * the following conditions are required:
- * -# rhs_info.n0 can only be 4, 8 and 16
- * -# rhs_info.k0 can only be 4, 8 and 16
- * -# Data type can only be F32
- * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
- *       -# The stride Y for input1 should satisfy the OpenCL pitch alignment requirement
- *       -# input1 width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
- *       -# input1 (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true).
- *                             The number of dimensions for the LHS matrix must be less than or equal to 4.
- * @param[in]  input1          Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
- * @param[in] input2 Input tensor containing the bias matrix. Data type supported: same as @p input0.
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of the matrix bias
- * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread. Only the following values are supported:
- * lhs_info.m0: 1,2,3,4,5,6,7,8
- * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported:
- * rhs_info.k0: 2,3,4,8,16
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.transpose: true,false
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
- *
- * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
- *       Reading from the OpenCL image object can increase performance. However, since the OpenCL image object is created by importing the OpenCL buffer,
- * the following conditions are required:
- * -# rhs_info.n0 can only be 4, 8 and 16
- * -# rhs_info.k0 can only be 4, 8 and 16
- * -# Data type can only be F32
- * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
- *       -# The stride Y for input1 should satisfy the OpenCL pitch alignment requirement
- *       -# input1 width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
- *       -# input1 (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
- *
- * @param[in] input0 Input tensor info for the LHS matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true).
- *                             The number of dimensions for the LHS matrix must be less than or equal to 4.
- * @param[in]  input1          Input tensor info for the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
- * @param[in] input2 Input tensor info containing the bias matrix. Data type supported: same as @p input0.
- * @param[in] output Output tensor info. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of the matrix bias
- * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread. Only the following values are supported:
- * lhs_info.m0: 1,2,3,4,5,6,7,8
- * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported:
- * rhs_info.k0: 2,3,4,8,16
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.transpose: true,false
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input0;
- const ICLTensor *_input1;
- const ICLTensor *_input2;
- ICLTensor *_output;
- bool _slide_matrix_b;
- bool _reinterpret_input_as_3d;
- bool _reinterpret_output_as_3d;
- bool _use_dummy_work_items;
- bool _add_bias;
- bool _broadcast_bias;
- bool _export_to_cl_image;
- bool _has_pad_y;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H*/
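Among the image-import preconditions listed above, the pitch-alignment one depends on a device query. A hedged sketch using the plain OpenCL C API (CL_DEVICE_IMAGE_PITCH_ALIGNMENT is the OpenCL 2.0 / cl_khr_image2d_from_buffer query and is reported in pixels; the helper name is ours, for illustration):

    #include <CL/cl.h>
    #include <cstddef>

    // Returns true if a row pitch (in bytes) meets the device's image-from-buffer
    // pitch alignment for a format whose pixel occupies pixel_size_bytes.
    bool row_pitch_is_aligned(cl_device_id device, std::size_t row_pitch_bytes, std::size_t pixel_size_bytes)
    {
        cl_uint pitch_alignment_pixels = 0;
        if(clGetDeviceInfo(device, CL_DEVICE_IMAGE_PITCH_ALIGNMENT,
                           sizeof(pitch_alignment_pixels), &pitch_alignment_pixels, nullptr) != CL_SUCCESS
           || pitch_alignment_pixels == 0)
        {
            return false; // query failed or images from buffers are unsupported
        }
        return (row_pitch_bytes % (pitch_alignment_pixels * pixel_size_bytes)) == 0;
    }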
diff --git a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
deleted file mode 100644
index 52510075b7..0000000000
--- a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.v0 == 0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
-
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_lhs_reshaped_shape(*input, lhs_info, reinterpret_input_as_3d));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
-{
- const unsigned int num_elems_processed_per_iteration_x = lhs_info.k0;
- const unsigned int num_elems_processed_per_iteration_y = lhs_info.m0;
- bool window_changed = false;
-
- TensorInfo tmp_info(*input);
-
- if(reinterpret_input_as_3d)
- {
- // Since the input tensor has to be reinterpreted as 3D and the execute window is based on a 2D interleave,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(input->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*input, lhs_info, reinterpret_input_as_3d)));
-
- // Configure window
- Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- Window win_in = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic input_access(input, 0, 0,
- input->dimension(0),
- input->dimension(1));
- AccessWindowStatic output_access(output, 0, 0, output->dimension(0), output->dimension(1));
-
- window_changed = update_window_and_padding(win_in, input_access) || // window used by the execute_window_loop
- update_window_and_padding(win, output_access); // window used to update the padding requirements of output tensor
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win.collapse(win, Window::DimZ);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
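tmp_shape.collapse(2U, 1U) above merges two dimensions starting at index 1, so a W x H x D input is windowed as W x (H * D). A tiny sketch of that arithmetic under assumed extents:

    #include <cstdio>

    int main()
    {
        const unsigned int w = 32, h = 24, d = 4; // assumed input extents
        std::printf("collapsed window shape: %u x %u\n", w, h * d); // 32 x 96
        return 0;
    }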
-
-CLGEMMReshapeLHSMatrixKernel::CLGEMMReshapeLHSMatrixKernel()
- : _input(nullptr), _output(nullptr), _reinterpret_input_as_3d(false)
-{
-}
-
-void CLGEMMReshapeLHSMatrixKernel::configure(const ICLTensor *input, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, lhs_info, reinterpret_input_as_3d);
-}
-
-void CLGEMMReshapeLHSMatrixKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), lhs_info, reinterpret_input_as_3d));
-
- auto padding_info = get_padding_info({ input });
-
- _input = input;
- _output = output;
- _reinterpret_input_as_3d = reinterpret_input_as_3d;
-
- const unsigned int src_w = input->info()->dimension(0);
- const unsigned int src_h = _reinterpret_input_as_3d ? input->info()->dimension(1) * input->info()->dimension(2) : input->info()->dimension(1);
- const unsigned int partial_load_m0 = src_h % lhs_info.m0;
- const unsigned int partial_load_k0 = src_w % lhs_info.k0;
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
- build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
- build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_w));
- build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_h));
- build_opts.add_option_if(lhs_info.interleave, "-DINTERLEAVE");
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(1)));
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(2)));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
- build_opts.add_option("-DPARTIAL_LOAD_M0=" + support::cpp11::to_string(partial_load_m0));
- build_opts.add_option("-DPARTIAL_LOAD_K0=" + support::cpp11::to_string(partial_load_k0));
-
- std::string kernel_name("gemm_reshape_lhs_matrix_");
- kernel_name += lhs_info.transpose ? "t" : "nt";
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), lhs_info, reinterpret_input_as_3d);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Set config_id for enabling LWS tuning
- _config_id = "gemm_reshape_lhs_matrix_";
- _config_id += (_reinterpret_input_as_3d ? "3d_" : "");
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.m0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.k0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.v0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.interleave);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.transpose);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
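The partial-load values computed in configure() mirror the partial stores of the multiply kernels: they are the leftover rows/columns when src_h or src_w is not a multiple of the block size. A worked computation under assumed sizes:

    #include <cstdio>

    int main()
    {
        const unsigned int src_w = 30, src_h = 23; // assumed input extents
        const unsigned int m0 = 4, k0 = 8;         // assumed lhs_info block sizes

        std::printf("PARTIAL_LOAD_M0=%u PARTIAL_LOAD_K0=%u\n",
                    src_h % m0,  // 23 % 4 = 3 leftover rows
                    src_w % k0); // 30 % 8 = 6 leftover columns
        return 0;
    }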
-
-Status CLGEMMReshapeLHSMatrixKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, lhs_info, reinterpret_input_as_3d));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), lhs_info, reinterpret_input_as_3d).first);
-
- return Status{};
-}
-
-void CLGEMMReshapeLHSMatrixKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_3D();
-
- if(_reinterpret_input_as_3d)
- {
- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 2 * num_arguments_per_3D_tensor();
- const unsigned int total_cross_plane_pad = _input->info()->padding().top + _input->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h
deleted file mode 100644
index 92202a26fc..0000000000
--- a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMRESHAPELHSMATRIXKERNEL_H
-#define ARM_COMPUTE_CLGEMMRESHAPELHSMATRIXKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to reshape the LHS matrix when performing the matrix multiplication.
- * In particular, this kernel splits the input matrix into blocks of size M0xK0 (defined through GEMMLHSMatrixInfo) and
- * stores each block in the output matrix, unrolling the values
- */
-class CLGEMMReshapeLHSMatrixKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLGEMMReshapeLHSMatrixKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapeLHSMatrixKernel(const CLGEMMReshapeLHSMatrixKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapeLHSMatrixKernel &operator=(const CLGEMMReshapeLHSMatrixKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMReshapeLHSMatrixKernel(CLGEMMReshapeLHSMatrixKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMReshapeLHSMatrixKernel &operator=(CLGEMMReshapeLHSMatrixKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data types supported: All
- * @param[out] output Output tensor. Data type supported: same as @p input
- * @param[in] lhs_info LHS matrix information to be used for reshaping. This object contains all the necessary
- * information to reshape the input tensor. Only the following values are supported:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * lhs_info.v0: greater than 0
- * lhs_info.transpose: true, false
- * lhs_info.interleave: true, false
- * @param[in] reinterpret_input_as_3d (Optional) True if the input has to be reinterpreted as 3D tensor
- */
- void configure(const ICLTensor *input, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d = false);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: All
- * @param[out] output Output tensor. Data type supported: same as @p input
- * @param[in] lhs_info LHS matrix information to be used for reshaping. This object contains all the necessary
- * information to reshape the input tensor. Only the following values are supported:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * lhs_info.v0: greater than 0
- * lhs_info.transpose: true, false
- * lhs_info.interleave: true, false
- * @param[in] reinterpret_input_as_3d (Optional) True if the input has to be reinterpreted as 3D tensor
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d = false);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMReshapeLHSMatrixKernel
- *
- * @param[in] input Input tensor info. Data types supported: All
- * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
- * @param[in] lhs_info LHS matrix information to be used for reshaping. This object contains all the necessary
- * information to reshape the input tensor. Only the following values are supported:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * lhs_info.v0: greater than 0
- * lhs_info.transpose: true, false
- * lhs_info.interleave: true, false
- * @param[in] reinterpret_input_as_3d True if the input has to be reinterpreted as 3D tensor
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d);
-
- // Inherited methods overridden
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- bool _reinterpret_input_as_3d;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMRESHAPELHSMATRIXKERNEL_H */
\ No newline at end of file
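To illustrate the M0 x K0 block unrolling described in the class comment, here is a generic sketch under assumed M0 = 2, K0 = 2, v0 = 1, no transpose or interleave; it shows the blocking idea only, not necessarily the exact element order the OpenCL kernel emits:

    #include <cstdio>
    #include <vector>

    int main()
    {
        const int M = 4, K = 4, M0 = 2, K0 = 2;
        std::vector<int> in(M * K), out;
        for(int i = 0; i < M * K; ++i) in[i] = i; // 4x4 row-major input

        for(int bm = 0; bm < M; bm += M0)            // walk the blocks...
            for(int bk = 0; bk < K; bk += K0)
                for(int r = 0; r < M0; ++r)          // ...and unroll each one
                    for(int c = 0; c < K0; ++c)
                        out.push_back(in[(bm + r) * K + (bk + c)]);

        for(int v : out) std::printf("%d ", v);
        std::printf("\n"); // 0 1 4 5 2 3 6 7 8 9 12 13 10 11 14 15
        return 0;
    }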
diff --git a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp
deleted file mode 100644
index 33de61ed01..0000000000
--- a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/CL/gemm/CLGEMMHelpers.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const GEMMRHSMatrixInfo &rhs_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.h0 == 0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && (rhs_info.k0 != 1) && (rhs_info.k0 != 3)), "Only 1,2,3,4,8,16 are supported for k0");
- ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16);
- ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16);
- ARM_COMPUTE_RETURN_ERROR_ON((rhs_info.k0 == 1) && (rhs_info.transpose));
-
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
-
- if(rhs_info.export_to_cl_image)
- {
- const TensorInfo tensor_reshaped_info(compute_rhs_reshaped_shape(*input, rhs_info), 1, input->data_type());
- ARM_COMPUTE_RETURN_ON_ERROR(cl_gemm::validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info));
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_rhs_reshaped_shape(*input, rhs_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const GEMMRHSMatrixInfo &rhs_info)
-{
- const unsigned int num_elems_processed_per_iteration_x = rhs_info.n0;
- const unsigned int num_elems_processed_per_iteration_y = rhs_info.k0;
- bool window_changed = false;
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*input, rhs_info)));
-
- // Configure window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
- AccessWindowStatic output_access(output, 0, 0, output->dimension(0), output->dimension(1));
-
- window_changed = update_window_and_padding(win, input_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
-
- if(rhs_info.export_to_cl_image)
- {
- arm_compute::cl_gemm::update_padding_for_cl_image(output);
- }
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win.collapse(win, Window::DimZ);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMReshapeRHSMatrixKernel::CLGEMMReshapeRHSMatrixKernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLGEMMReshapeRHSMatrixKernel::configure(const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, rhs_info);
-}
-
-void CLGEMMReshapeRHSMatrixKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), rhs_info));
-
- _input = input;
- _output = output;
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
- build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
- build_opts.add_option_if(rhs_info.transpose, "-DTRANSPOSE");
- build_opts.add_option_if(rhs_info.interleave, "-DINTERLEAVE");
- build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
-
- std::string kernel_name("gemm_reshape_rhs_matrix_");
- kernel_name += rhs_info.transpose ? "t" : "nt";
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), rhs_info);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-}
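For a sense of what configure() hands to the OpenCL compiler, here is the option string an assumed configuration would produce (rhs_info: n0 = 4, k0 = 4, h0 = 8, transpose and interleave enabled; input height 128; 4-byte elements, which we assume the element-size helper maps to uint); the kernel selected would be gemm_reshape_rhs_matrix_t:

    #include <cstdio>
    #include <string>

    int main()
    {
        // Assembled by hand to mirror the build_opts calls above (assumed values).
        const std::string opts = "-DN0=4 -DK0=4 -DH0=8 -DTRANSPOSE -DINTERLEAVE "
                                 "-DSRC_HEIGHT=128 -DDATA_TYPE=uint";
        std::printf("gemm_reshape_rhs_matrix_t: %s\n", opts.c_str());
        return 0;
    }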
-
-Status CLGEMMReshapeRHSMatrixKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const GEMMRHSMatrixInfo &rhs_info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, rhs_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), rhs_info).first);
-
- return Status{};
-}
-
-void CLGEMMReshapeRHSMatrixKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h
deleted file mode 100644
index 911484ea76..0000000000
--- a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMRESHAPERHSMATRIXKERNEL_H
-#define ARM_COMPUTE_CLGEMMRESHAPERHSMATRIXKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to reshape the RHS matrix when performing the matrix multiplication
- * In particular, this kernel splits the input matrix into blocks of size K0xN0 and stores each block in
- * the output matrix, unrolling the values */
-class CLGEMMReshapeRHSMatrixKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLGEMMReshapeRHSMatrixKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapeRHSMatrixKernel(const CLGEMMReshapeRHSMatrixKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapeRHSMatrixKernel &operator=(const CLGEMMReshapeRHSMatrixKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMReshapeRHSMatrixKernel(CLGEMMReshapeRHSMatrixKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMReshapeRHSMatrixKernel &operator=(CLGEMMReshapeRHSMatrixKernel &&) = default;
- /** Default destructor */
- ~CLGEMMReshapeRHSMatrixKernel() = default;
- /** Initialise the kernel's input and output.
- *
- * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor,
- *       required to create an OpenCL image object from a buffer in @ref CLGEMMMatrixMultiplyReshapedKernel and in @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.
- *       Since the OpenCL image object is created by importing the OpenCL buffer, the following conditions are required:
- * -# rhs_info.n0 can only be 4, 8 and 16
- * -# rhs_info.k0 can only be 4, 8 and 16
- * -# Data type can only be F32, F16
- * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
- *       -# output width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
- *       -# output (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
- *       -# The output tensor should only be consumed by @ref CLGEMMMatrixMultiplyReshapedKernel or @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
- *
- * @param[in] input Input tensor. Data types supported: All
- * @param[out] output Output tensor. Data type supported: same as @p input
- * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary
- * information to reshape the input tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
- * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false), (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
- * rhs_info.h0: greater than 0
- * rhs_info.transpose: true, false
- * rhs_info.interleave: true, false
- */
- void configure(const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info);
- /** Initialise the kernel's input and output.
- *
- * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor,
- * required to create an OpenCL image object from a buffer in @ref CLGEMMMatrixMultiplyReshapedKernel and in @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.
- * Since the OpenCL image object is created by importing the OpenCL buffer, the following conditions are required:
- * -# rhs_info.n0 can only be 4, 8 and 16
- * -# rhs_info.k0 can only be 4, 8 and 16
- * -# Data type can only be F32, F16
- * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
- * -# output width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
- * -# output (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
- * -# The output tensor should only be consumed by @ref CLGEMMMatrixMultiplyReshapedKernel or @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: All
- * @param[out] output Output tensor. Data type supported: same as @p input
- * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary
- * information to reshape the input tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
- * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false), (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
- * rhs_info.h0: greater than 0
- * rhs_info.transpose: true, false
- * rhs_info.interleave: true, false
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMReshapeRHSMatrixKernel
- *
- * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor,
- * required to create an OpenCL image object from a buffer in @ref CLGEMMMatrixMultiplyReshapedKernel and in @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.
- * Since the OpenCL image object is created by importing the OpenCL buffer, the following conditions are required:
- * -# rhs_info.n0 can only be 4, 8 and 16
- * -# rhs_info.k0 can only be 4, 8 and 16
- * -# Data type can only be F32, F16
- * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
- * -# output width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
- * -# output (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
- * -# The output tensor should only be consumed by @ref CLGEMMMatrixMultiplyReshapedKernel or @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
- *
- * @param[in] input Input tensor info. Data types supported: All
- * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
- * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary
- * information to reshape the input tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
- * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false),(only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
- * rhs_info.h0: greater than 0
- * rhs_info.transpose: true, false
- * rhs_info.interleave: true, false
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const GEMMRHSMatrixInfo &rhs_info);
-
- // Inherited methods overridden
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMRESHAPERHSMATRIXKERNEL_H */
\ No newline at end of file
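For reference, the deleted kernel tiles the RHS operand of the matrix multiplication into K0xN0 blocks and stores each block contiguously ("unrolled") in the output tensor. The fragment below is a minimal host-side sketch of that tiling idea under simplifying assumptions: the h0 grouping, transpose and interleave options carried by GEMMRHSMatrixInfo are omitted, so it illustrates the concept rather than the library's exact memory layout.

    // Partition a K x N row-major matrix into K0 x N0 tiles and emit each
    // tile contiguously. Sketch only: h0/transpose/interleave are omitted.
    #include <cstddef>
    #include <vector>

    std::vector<float>
    reshape_rhs(const std::vector<float> &rhs, size_t K, size_t N, size_t K0, size_t N0)
    {
        std::vector<float> out;
        out.reserve(K * N);
        for (size_t k = 0; k < K; k += K0) // walk tile rows
        {
            for (size_t n = 0; n < N; n += N0) // walk tile columns
            {
                for (size_t kk = k; kk < k + K0 && kk < K; ++kk)
                {
                    for (size_t nn = n; nn < n + N0 && nn < N; ++nn)
                    {
                        out.push_back(rhs[kk * N + nn]); // unroll the K0 x N0 block
                    }
                }
            }
        }
        return out;
    }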
diff --git a/src/core/CL/kernels/CLGatherKernel.cpp b/src/core/CL/kernels/CLGatherKernel.cpp
index e33bc7afd7..904bb07282 100644
--- a/src/core/CL/kernels/CLGatherKernel.cpp
+++ b/src/core/CL/kernels/CLGatherKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,8 +22,10 @@
* SOFTWARE.
*/
#include "src/core/CL/kernels/CLGatherKernel.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -34,20 +36,22 @@ namespace arm_compute
{
namespace
{
-inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+inline Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
- ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() + indices->num_dimensions() - 1) > 4);
+
ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(
+ input->tensor_shape(), indices->tensor_shape(), actual_axis);
ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
}
@@ -56,26 +60,27 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, ITensorInfo *output, int axis)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, ITensorInfo *output, int axis)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
// Output auto initialization if not yet initialized
- TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(
+ input->tensor_shape(), indices->tensor_shape(), actual_axis);
auto_init_if_empty((*output), output_shape, 1, input->data_type());
// Create window
Window win = calculate_max_window(*output, Steps());
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
return std::make_pair(Status{}, win);
}
} // namespace
-CLGatherKernel::CLGatherKernel()
- : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
+CLGatherKernel::CLGatherKernel() : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLGatherKernel::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
@@ -83,10 +88,14 @@ void CLGatherKernel::configure(const ICLTensor *input, const ICLTensor *indices,
configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis);
}
-void CLGatherKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
+void CLGatherKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ int axis)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
- auto padding_info = get_padding_info({ input, output, indices });
+ auto padding_info = get_padding_info({input, output, indices});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), indices->info(), output->info(), axis));
// Configure kernel window
@@ -100,10 +109,12 @@ void CLGatherKernel::configure(const CLCompileContext &compile_context, const IC
// Set build options
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
build_opts.add_option("-DOUTPUT_DIM_Z=" + support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.add_option("-DINDICES_DIMS=" + support::cpp11::to_string(indices->info()->num_dimensions()));
build_opts.add_option("-DAXIS=" + support::cpp11::to_string(_axis));
+ build_opts.add_option("-DINDEX_LIMIT=" + support::cpp11::to_string(input->info()->tensor_shape()[_axis]));
// Create kernel
_kernel = create_kernel(compile_context, "gather", build_opts.options());
@@ -111,10 +122,12 @@ void CLGatherKernel::configure(const CLCompileContext &compile_context, const IC
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+Status
+CLGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis).first);
return Status{};
}
@@ -126,7 +139,7 @@ void CLGatherKernel::run(const Window &window, cl::CommandQueue &queue)
Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
unsigned int idx = 0;
add_4D_tensor_argument(idx, _input, window_collapsed);
- add_1D_tensor_argument(idx, _indices, window_collapsed);
+ add_4D_tensor_argument(idx, _indices, window_collapsed);
add_4D_tensor_argument(idx, _output, window_collapsed);
enqueue(queue, *this, window_collapsed, lws_hint());
}
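The gather changes above lift the old restriction that indices be 1-D: validate_arguments() now accepts multi-dimensional indices as long as rank(input) + rank(indices) - 1 <= 4, and run() binds the indices as a 4D tensor. Below is a hedged sketch of the resulting output-shape rule, i.e. the conventional gather semantics that compute_gather_shape appears to follow; the helper is illustrative, not library API.

    // Output shape of gather: the input's `axis` dimension is replaced by the
    // full indices shape, so rank(out) = rank(in) + rank(indices) - 1.
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<uint32_t> gather_output_shape(const std::vector<uint32_t> &in_shape,
                                              const std::vector<uint32_t> &idx_shape,
                                              size_t axis)
    {
        std::vector<uint32_t> out(in_shape.begin(), in_shape.begin() + axis);
        out.insert(out.end(), idx_shape.begin(), idx_shape.end());
        out.insert(out.end(), in_shape.begin() + axis + 1, in_shape.end());
        return out;
    }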
diff --git a/src/core/CL/kernels/CLGatherKernel.h b/src/core/CL/kernels/CLGatherKernel.h
index 8f472a4696..db4b49d2f5 100644
--- a/src/core/CL/kernels/CLGatherKernel.h
+++ b/src/core/CL/kernels/CLGatherKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLGATHERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -63,7 +64,11 @@ public:
* @param[out] output Destination tensor. Data type supported: Same as @p input
* @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ int axis = 0);
/** Static function to check if given info will lead to a valid configuration of @ref CLGatherKernel
*
@@ -74,7 +79,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
index 988bb39d88..b9ff72b928 100644
--- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
+++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,13 +25,13 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLArray.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "src/core/AccessWindowStatic.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -48,7 +48,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
ARM_COMPUTE_RETURN_ERROR_ON(anchors->dimension(0) != info.values_per_roi());
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(anchors, DataType::QSYMM16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(anchors->num_dimensions() > 2);
- if(all_anchors->total_size() > 0)
+ if (all_anchors->total_size() > 0)
{
size_t feature_height = info.feat_height();
size_t feature_width = info.feat_width();
@@ -58,7 +58,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(0) != info.values_per_roi());
ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(1) != feature_height * feature_width * num_anchors);
- if(is_data_type_quantized(anchors->data_type()))
+ if (is_data_type_quantized(anchors->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(anchors, all_anchors);
}
@@ -67,20 +67,25 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
}
} // namespace
-CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel()
- : _anchors(nullptr), _all_anchors(nullptr)
+CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel() : _anchors(nullptr), _all_anchors(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLComputeAllAnchorsKernel::configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info)
+void CLComputeAllAnchorsKernel::configure(const ICLTensor *anchors,
+ ICLTensor *all_anchors,
+ const ComputeAnchorsInfo &info)
{
configure(CLKernelLibrary::get().get_compile_context(), anchors, all_anchors, info);
}
-void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info)
+void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *anchors,
+ ICLTensor *all_anchors,
+ const ComputeAnchorsInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(anchors, all_anchors);
- auto padding_info = get_padding_info({ anchors, all_anchors });
+ auto padding_info = get_padding_info({anchors, all_anchors});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(anchors->info(), all_anchors->info(), info));
// Metadata
@@ -91,7 +96,8 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex
// Initialize the output if empty
const TensorShape output_shape(info.values_per_roi(), width * height * num_anchors);
- auto_init_if_empty(*all_anchors->info(), TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info()));
+ auto_init_if_empty(*all_anchors->info(),
+ TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info()));
// Set instance variables
_anchors = anchors;
@@ -108,7 +114,7 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex
build_opts.add_option("-DNUM_ANCHORS=" + support::cpp11::to_string(num_anchors));
build_opts.add_option("-DNUM_ROI_FIELDS=" + support::cpp11::to_string(info.values_per_roi()));
- if(is_quantized)
+ if (is_quantized)
{
const UniformQuantizationInfo qinfo = anchors->info()->quantization_info().uniform();
build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
@@ -116,8 +122,9 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex
}
// Create kernel
- const std::string kernel_name = (is_quantized) ? "generate_proposals_compute_all_anchors_quantized" : "generate_proposals_compute_all_anchors";
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+ const std::string kernel_name =
+ (is_quantized) ? "generate_proposals_compute_all_anchors_quantized" : "generate_proposals_compute_all_anchors";
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// The tensor all_anchors can be interpreted as an array of structs (each structs has values_per_roi fields).
// This means we don't need to pad on the X dimension, as we know in advance how many fields
@@ -127,7 +134,9 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
+Status CLComputeAllAnchorsKernel::validate(const ITensorInfo *anchors,
+ const ITensorInfo *all_anchors,
+ const ComputeAnchorsInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(anchors, all_anchors, info));
return Status{};
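For context on the kernel being reworked here: "compute all anchors" replicates each base anchor at every spatial location of the feature map, which is why the output is initialized above with shape (values_per_roi, width * height * num_anchors). A plain C++ sketch of the assumed semantics follows; the stride application and iteration order are illustrative and not taken from the OpenCL source.

    // Broadcast num_anchors base boxes (x1, y1, x2, y2) over a feat_w x feat_h
    // grid, shifting each by the feature-map stride.
    #include <vector>

    struct Anchor { float x1, y1, x2, y2; };

    std::vector<Anchor> compute_all_anchors(const std::vector<Anchor> &base,
                                            int feat_w, int feat_h, float stride)
    {
        std::vector<Anchor> all;
        all.reserve(base.size() * feat_w * feat_h);
        for (int y = 0; y < feat_h; ++y)
            for (int x = 0; x < feat_w; ++x)
                for (const Anchor &a : base)
                    all.push_back({a.x1 + x * stride, a.y1 + y * stride,
                                   a.x2 + x * stride, a.y2 + y * stride});
        return all;
    }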
diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h
index d26795ac7d..e08f281d6c 100644
--- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h
+++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h
@@ -62,7 +62,10 @@ public:
* @param[in] info Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
*
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *anchors,
+ ICLTensor *all_anchors,
+ const ComputeAnchorsInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLComputeAllAnchorsKernel
*
@@ -81,5 +84,5 @@ private:
const ICLTensor *_anchors;
ICLTensor *_all_anchors;
};
-} // arm_compute
+} // namespace arm_compute
#endif // ARM_COMPUTE_CLGENERATEPROSPOSALSLAYERKERNEL_H
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
deleted file mode 100644
index 07309de83c..0000000000
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ /dev/null
@@ -1,441 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLIm2ColKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <cmath>
-#include <tuple>
-#include <utility>
-
-namespace arm_compute
-{
-using namespace misc::shape_calculator;
-
-namespace
-{
-struct Im2ColConfiguration
-{
- std::string kernel_name{};
- std::set<std::string> build_options{};
- unsigned int num_elems_processed_per_iteration{};
- bool is_padding_required_nchw{};
-};
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
- unsigned int num_groups)
-{
- const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
-
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(input->data_type()) && has_bias);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON(num_groups == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::NHWC && num_groups > 1);
- ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(channel_idx) % num_groups) != 0);
-
- // Since there's no implicit padding added, check the total input spatial dimensions (with conv paddings) are big enough for the kernel dimensions
- const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- const unsigned total_width = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right();
- const unsigned total_height = input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom();
- ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height));
-
- if(output->total_size() > 0)
- {
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
- unsigned int num_elems_processed_per_iteration, bool is_padding_required_nchw, unsigned int num_groups)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output tensor auto initialization if not yet initialized
- TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups);
-
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(expected_output_shape));
-
- const DataLayout data_layout = input->data_layout();
- const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int input_width = input->dimension(width_idx);
- const unsigned int input_height = input->dimension(height_idx);
-
- // Configure the execute window based on the selected optimal OpenCL kernel
- bool window_changed = false;
- Window win;
-
- if(data_layout == DataLayout::NHWC)
- {
- win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
- const int xin_start = 0;
- const int xin_end = input->dimension(0);
- const int yin_start = 0;
- const int yin_end = input->dimension(1);
-
- const int xout_start = 0;
- const int xout_end = output->dimension(0);
- const int yout_start = 0;
- const int yout_end = output->dimension(1);
-
- AccessWindowStatic input_access(input, xin_start, yin_start, xin_end, yin_end);
- AccessWindowStatic output_access(output, xout_start, yout_start, xout_end, yout_end);
- window_changed = window_changed || update_window_and_padding(win, input_access, output_access);
- }
- else
- {
- if(is_padding_required_nchw)
- {
- const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
- win = calculate_max_window(*input,
- Steps(num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second));
- AccessWindowStatic input_access(input,
- -border.left,
- -border.top,
- ceil_to_multiple(input_width + border.right, kernel_dims.width * num_elems_processed_per_iteration),
- input_height + border.bottom);
- window_changed = window_changed || update_window_and_padding(win, input_access);
- }
- else
- {
- // For the generic case, CLIm2ColKernel doesn't need padding (we do not read out-of-bounds elements) so
- // update_window_and_padding() can be skipped
- win = calculate_max_window(*input, Steps());
- }
- }
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
- // set the Z dimension's step same size as the whole dimension so that one can't split across the Z dimension
- win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start());
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
-Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *input, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups)
-{
- const DataLayout data_layout = input->data_layout();
- const DataType data_type = input->data_type();
- const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const unsigned int input_width = input->dimension(width_idx);
- const unsigned int input_height = input->dimension(height_idx);
- const unsigned int input_channel = input->dimension(channel_idx);
-
- const std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
-
- // Im2Col configuration
- std::string kernel_name = "im2col_generic_";
- CLBuildOptions build_opts;
- unsigned int num_elems_processed_per_iteration = 1;
- bool is_padding_required_nchw = false;
- const UniformQuantizationInfo qinfo = input->quantization_info().uniform();
-
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
- build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->element_size()));
- build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
- build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
- build_opts.add_option("-DCONVOLVED_WIDTH=" + support::cpp11::to_string(convolved_dims.first));
- build_opts.add_option("-DCONVOLVED_HEIGHT=" + support::cpp11::to_string(convolved_dims.second));
- build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
- build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
- build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
- build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
- build_opts.add_option("-DPAD_RIGHT=" + support::cpp11::to_string(conv_info.pad_right()));
- build_opts.add_option("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom()));
- build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width));
- build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height));
- build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_channel));
- build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
- build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
- build_opts.add_option_if(num_groups > 1, "-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
- build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(qinfo.offset), "-DPAD_VALUE=0");
- build_opts.add_option_if(has_bias, "-DHAS_BIAS");
-
- if(data_layout == DataLayout::NHWC)
- {
- num_elems_processed_per_iteration = std::min(2U, input_channel);
- is_padding_required_nchw = false;
-
- // Only the 3x3 and 9x9 cases are optimized for NHWC
- if(kernel_dims == Size2D(3U, 3U))
- {
- kernel_name = "im2col3x3_";
- }
- else if(kernel_dims == Size2D(9U, 9U))
- {
- kernel_name = "im2col9x9_";
- }
-
- // Get boundary vector (the first/last vector with potentially a partial vector size) size
- // If input_channel is a multiple of num_elems_processed_per_iteration, the boundary vec size is the (full) vector size
- // otherwise, the boundary vec size is the (partial) remainder vector size
- const unsigned int vec_size = num_elems_processed_per_iteration;
- const unsigned int partial_vec_size = input_channel % vec_size;
- const unsigned int boundary_vec_size = vec_size - ((vec_size - partial_vec_size) % vec_size);
- build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vec_size));
- build_opts.add_option("-DBOUNDARY_VECTOR_SIZE=" + support::cpp11::to_string(boundary_vec_size));
- }
- else
- {
- if(dilation == Size2D(1U, 1U))
- {
- const bool squared_im2col = kernel_dims.width == kernel_dims.height;
- if(squared_im2col)
- {
- // Check if we can run an optimized im2col for NCHW
- switch(kernel_dims.width)
- {
- case 1:
- // Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false
- if(conv_info.stride().first == 1 && !conv_info.has_padding())
- {
- kernel_name = "im2col1x1_stridex1_";
- num_elems_processed_per_iteration = 4;
- is_padding_required_nchw = true;
- }
- break;
- case 3:
- kernel_name = "im2col3x3_";
- num_elems_processed_per_iteration = 1;
- is_padding_required_nchw = true;
- break;
- case 5:
- kernel_name = "im2col5x5_";
- num_elems_processed_per_iteration = 1;
- is_padding_required_nchw = true;
- break;
- case 11:
- // Optimized im2col11x11 if pad_x = pad_y = 0
- if(!conv_info.has_padding())
- {
- kernel_name = "im2col11x11_padx0_pady0_";
- num_elems_processed_per_iteration = 1;
- is_padding_required_nchw = true;
- }
- break;
- default:
- kernel_name = "im2col_generic_";
- num_elems_processed_per_iteration = 1;
- is_padding_required_nchw = false;
- break;
- }
- }
- else if(kernel_dims.width > 1 && !conv_info.has_padding())
- {
- kernel_name = "im2col_generic_padx0_pady0_";
- num_elems_processed_per_iteration = 1;
- is_padding_required_nchw = false;
-
- // Optimized im2col is performed using one or more vector operations with the specified vector size
- // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4
- // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3.
- // Using a vector size of 4 is always safe, since any remainder can be handled with OpenCL vectors of size 2 and 3.
- // Using the vector size of 8, however, may be faster.
- // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0
- // is used instead.)
- const size_t vector_size = std::min(static_cast<size_t>(4), kernel_dims.width);
- const size_t width_mod_vector_size = kernel_dims.width % vector_size;
- build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
- build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size));
- }
- }
- }
-
- // Append the data layout to the kernel_name
- kernel_name += lower_string(string_from_data_layout(data_layout));
-
- Im2ColConfiguration im2col_config;
- im2col_config.kernel_name = kernel_name;
- im2col_config.build_options = build_opts.options();
- im2col_config.num_elems_processed_per_iteration = num_elems_processed_per_iteration;
- im2col_config.is_padding_required_nchw = is_padding_required_nchw;
-
- return im2col_config;
-}
-} // namespace
-
-CLIm2ColKernel::CLIm2ColKernel()
- : _input(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups()
-{
-}
-
-void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
- unsigned int num_groups)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, kernel_dims, conv_info, has_bias, dilation, num_groups);
-}
-
-void CLIm2ColKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias,
- const Size2D &dilation,
- unsigned int num_groups)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups));
-
- auto padding_info = get_padding_info({ input, output });
- _data_layout = input->info()->data_layout();
-
- const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int input_width = input->info()->dimension(width_idx);
- const unsigned int input_height = input->info()->dimension(height_idx);
-
- // Select and configure the optimal OpenCL kernel to run.
- // This function returns the OpenCL kernel's name, the arguments to pass at compile time, the number of elements processed per iteration
- // and the padding requirement flag
- Im2ColConfiguration im2col_config = configure_opencl_kernel(input->info(), kernel_dims, conv_info, has_bias, dilation, num_groups);
-
- // Create kernel
- _kernel = create_kernel(compile_context, im2col_config.kernel_name, im2col_config.build_options);
-
- _input = input;
- _output = output;
- _convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
- _num_elems_processed_per_iteration = im2col_config.num_elems_processed_per_iteration;
- _kernel_dims = kernel_dims; // Only needed by the Tuner
- _conv_info = conv_info; // Only needed by the Tuner
- _num_groups = num_groups;
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration,
- im2col_config.is_padding_required_nchw, num_groups);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Set config_id for enabling LWS tuning
- _config_id = im2col_config.kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(num_groups);
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += lower_string(string_from_data_layout(_data_layout));
-
- ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info));
-}
-
-Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
- unsigned int num_groups)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups));
- Im2ColConfiguration im2col_config = configure_opencl_kernel(input, kernel_dims, conv_info, has_bias, dilation, num_groups);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration,
- im2col_config.is_padding_required_nchw, num_groups)
- .first);
- return Status{};
-}
-
-void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- // Get initial windows
- // Collapse in order to have (SRC_DEPTH * BATCH_SIZE) on the 3rd dimension
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- window_collapsed.set_dimension_step(Window::DimZ, 1);
-
- Window window_output;
- window_output.use_tensor_dimensions(_output->info()->tensor_shape());
-
- const Window first_slice_3d = window_collapsed.first_slice_window_3D();
-
- Window slice = first_slice_3d;
- Window slice_in = first_slice_3d;
- Window slice_out = window_output.first_slice_window_2D();
-
- if(_data_layout == DataLayout::NHWC)
- {
- const Window tmp_win = window.collapse_if_possible(ICLKernel::window(), 3);
- const int num_batches = tmp_win[3].end();
-
- slice.set(1, Window::Dimension(0, static_cast<int>(_output->info()->tensor_shape()[1]), 1));
- slice.set(2, Window::Dimension(0, static_cast<int>(num_batches), 1));
- }
- else
- {
- slice.set(0, Window::Dimension(0, static_cast<int>(ceil_to_multiple(_convolved_dims.first, _num_elems_processed_per_iteration)), _num_elems_processed_per_iteration));
- slice.set(1, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1));
- // Note: In case of NCHW, the 3rd dimension is already set by collapsing the input window
- }
-
- // Setup input slice
- // The dimensions of the input are increased within the OpenCL kernel
- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- // Setup output slice
- // The dimensions of the output are increased within the OpenCL kernel
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- unsigned int idx = num_arguments_per_3D_tensor() + (_num_groups == 1 ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor());
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[((_num_groups == 1) ? 2 : 3)]));
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- if(_num_groups == 1)
- {
- add_2D_tensor_argument(idx, _output, slice_out);
- }
- else
- {
- add_3D_tensor_argument(idx, _output, slice_out);
- }
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLIm2ColKernel.h b/src/core/CL/kernels/CLIm2ColKernel.h
deleted file mode 100644
index 2920c7d138..0000000000
--- a/src/core/CL/kernels/CLIm2ColKernel.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLIM2COLKERNEL_H
-#define ARM_COMPUTE_CLIM2COLKERNEL_H
-
-#include "arm_compute/core/Size2D.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the im2col reshape kernel.
- *
- * Rearranges image blocks into columns, stripping each convolution patch out into a single column.
- * It is used to transform a convolution into a plain matrix multiplication.
- *
- * For example taking into account the image below and assuming 3x3 image blocks with stride of 1 we have:
- * @f[
- * \left( \begin{array}{cccc}
- * a00 & a01 & a02 & a03 \\
- * a10 & a11 & a12 & a13 \\
- * a20 & a21 & a22 & a23 \\
- * a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * =
- * \left( \begin{array}{ccccccccc}
- * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\
- * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\
- * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\
- * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\
- * \end{array} \right)
- * @f]
- */
-class CLIm2ColKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLIm2ColKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLIm2ColKernel(const CLIm2ColKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLIm2ColKernel &operator=(const CLIm2ColKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLIm2ColKernel(CLIm2ColKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLIm2ColKernel &operator=(CLIm2ColKernel &&) = default;
- /** Set the input and output of the kernel.
- *
- * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[out] output The output tensor. First 2 lower dimensions represent a transform of each 3D input,
- * while every dimension above represents a batch. Data types supported: Same as @p input
- * @param[in] kernel_dims The kernel dimensions (width and height).
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] has_bias In case biases are provided, expands the matrix with 1.
- * This is valid only for non-quantized inputs.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution.
- * Number of groups other than 1 is only supported for NCHW data layout.
- * The number of channels should be a multiple of the number of groups.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U),
- unsigned int num_groups = 1);
- /** Set the input and output of the kernel.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[out] output The output tensor. First 2 lower dimensions represent a transform of each 3D input,
- * while every dimension above represents a batch. Data types supported: Same as @p input
- * @param[in] kernel_dims The kernel dimensions (width and height).
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] has_bias In case biases are provided, expands the matrix with 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias,
- const Size2D &dilation = Size2D(1U, 1U),
- unsigned int num_groups = 1);
- /** Static function to check if given info will lead to a valid configuration of @ref CLIm2ColKernel
- *
- * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in] output The output tensor. First 2 lower dimensions represent a transform of each 3D input,
- * while every dimension above represents a batch. Data types supported: Same as @p input
- * @param[in] kernel_dims The kernel dimensions (width and height).
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] has_bias In case biases are provided, expands the matrix with 1.
- * This is valid only for non-quantized inputs.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution.
- * Number of groups other than 1 is only supported for NCHW data layout.
- * The number of channels should be a multiple of the number of groups.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U),
- unsigned int num_groups = 1);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-public:
- const ICLTensor *_input;
- ICLTensor *_output;
- DataLayout _data_layout;
- std::pair<unsigned int, unsigned int> _convolved_dims;
- unsigned int _num_elems_processed_per_iteration;
- Size2D _kernel_dims;
- PadStrideInfo _conv_info;
- unsigned int _num_groups;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLIM2COLKERNEL_H */
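Both im2col files are removed outright. As a compact reference for what the deleted kernel computed, here is a scalar NCHW im2col sketch matching the header's matrix example above; it omits dilation, grouping, the bias row and all vectorization, and is written for clarity rather than as a port of the OpenCL code.

    // Each kh x kw patch of the (zero-padded) input becomes one column of the
    // output matrix, so convolution reduces to a matrix multiplication.
    #include <cstddef>
    #include <vector>

    std::vector<float> im2col_nchw(const std::vector<float> &in, int C, int H, int W,
                                   int kh, int kw, int stride, int pad)
    {
        const int out_h = (H + 2 * pad - kh) / stride + 1;
        const int out_w = (W + 2 * pad - kw) / stride + 1;
        std::vector<float> col(static_cast<size_t>(C) * kh * kw * out_h * out_w, 0.f);
        for (int c = 0; c < C; ++c)
            for (int ky = 0; ky < kh; ++ky)
                for (int kx = 0; kx < kw; ++kx)
                    for (int oy = 0; oy < out_h; ++oy)
                        for (int ox = 0; ox < out_w; ++ox)
                        {
                            const int iy = oy * stride - pad + ky; // input row of this tap
                            const int ix = ox * stride - pad + kx; // input column of this tap
                            const size_t row = (static_cast<size_t>(c) * kh + ky) * kw + kx;
                            const size_t idx = row * (static_cast<size_t>(out_h) * out_w)
                                             + static_cast<size_t>(oy) * out_w + ox;
                            if (iy >= 0 && iy < H && ix >= 0 && ix < W)
                                col[idx] = in[(static_cast<size_t>(c) * H + iy) * W + ix];
                        }
        return col;
    }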
diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
index f9e1cbec27..b13eb16556 100644
--- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,65 +29,154 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const InstanceNormalizationLayerKernelInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.epsilon == 0.f, "Epsilon must be different than 0");
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+ "Input and output have different number of channels");
}
return Status{};
}
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+Status validate_arguments_meanvar(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
+
+ if (output != nullptr && output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+ "Input and output have different number of channels");
+ }
+
+ return Status{};
+}
+} // namespace
+
+CLComputeMeanVariance::CLComputeMeanVariance() : _input(nullptr), _output(nullptr)
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void CLComputeMeanVariance::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ bool use_mixed_precision)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ auto padding_info = get_padding_info({input, output});
+
+ _input = input;
+ _output = output == nullptr ? input : output;
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_meanvar(_input->info(), _output->info()));
+ const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DINTERNAL_DATA_TYPE=" +
+ (use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.add_option("-DDIM_Y=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.add_option("-DDIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.add_option_if(_input->info()->data_layout() == DataLayout::NHWC, "-DNHWC");
+ // Create kernel
+ _kernel = create_kernel(compile_context, "compute_mean_var", build_opts.options());
+
// We handle the planes manually
- Window win = calculate_max_window(*input, Steps(1));
+ Window win = calculate_max_window(*(input->info()), Steps(1));
+ const auto data_layout = input->info()->data_layout();
+ const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const unsigned int batches_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+ const unsigned int input_channel = input->info()->dimension(channel_idx);
+ const unsigned int input_batches = input->info()->dimension(batches_idx);
+ const TensorShape out_shape(input_channel, 2u, input_batches);
// Output auto initialization if not yet initialized
- auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type());
+ if (use_mixed_precision)
+ {
+ auto_init_if_empty(*_output->info(), out_shape, 1, DataType::F32);
+ }
+ else
+ {
+ auto_init_if_empty(*_output->info(), out_shape, 1, input->info()->data_type());
+ }
+ ICLKernel::configure_internal(win);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
- // CLInstanceNormalizationLayerKernel doesn't need padding so update_window_and_padding() can be skipped
- Coordinates coord;
- coord.set_num_dimensions(output->num_dimensions());
- output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
- return std::make_pair(Status{}, win);
+Status CLComputeMeanVariance::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_meanvar(input, output));
+ return Status{};
}
-} // namespace
-CLInstanceNormalizationLayerKernel::CLInstanceNormalizationLayerKernel()
- : _input(nullptr), _output(nullptr), _run_in_place(false)
+void CLComputeMeanVariance::run(const Window &window, cl::CommandQueue &queue)
{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window collapsed_window = window.collapse(window, Window::DimZ);
+
+ // We will process the planes together
+ if (_input->info()->data_layout() == DataLayout::NCHW)
+ {
+ collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
+ collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1));
+ }
+ else
+ {
+ collapsed_window.set(Window::DimZ, Window::Dimension(0, 1, 1));
+ collapsed_window.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(3), 1));
+ }
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, collapsed_window);
+ add_3D_tensor_argument(idx, _output, collapsed_window);
+
+ enqueue(queue, *this, collapsed_window, lws_hint());
}
-void CLInstanceNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info)
+CLInstanceNormalizationLayerKernel::CLInstanceNormalizationLayerKernel()
+ : _input(nullptr), _output(nullptr), _mean(nullptr), _run_in_place(false)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info)
+void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *mean_var,
+ ICLTensor *output,
+ const InstanceNormalizationLayerKernelInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output == nullptr ? input : output;
+ _mean = mean_var;
_run_in_place = (output == nullptr) || (output == input);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(), info));
@@ -95,7 +184,9 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (info.use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (info.use_mixed_precision
+ ? "float"
+ : get_cl_type_from_data_type(input->info()->data_type())));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0)));
build_opts.add_option("-DDIM_Y=" + support::cpp11::to_string(input->info()->dimension(1)));
@@ -110,16 +201,21 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi
_kernel = create_kernel(compile_context, "instance_normalization", build_opts.options());
// Configure kernel window
- auto win_config = validate_and_configure_window(_input->info(), _output->info());
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
- ICLKernel::configure_internal(std::get<1>(win_config));
+ Window win = calculate_max_window(*input->info(), Steps(1));
+ if (output != nullptr)
+ {
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type());
+ }
+
+ ICLKernel::configure_internal(win);
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info)
+Status CLInstanceNormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const InstanceNormalizationLayerKernelInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
return Status{};
}
@@ -131,7 +227,7 @@ void CLInstanceNormalizationLayerKernel::run(const Window &window, cl::CommandQu
Window collapsed_window = window.collapse(window, Window::DimZ);
// We will process the planes together
- if(_input->info()->data_layout() == DataLayout::NCHW)
+ if (_input->info()->data_layout() == DataLayout::NCHW)
{
collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1));
@@ -144,7 +240,9 @@ void CLInstanceNormalizationLayerKernel::run(const Window &window, cl::CommandQu
unsigned int idx = 0;
add_4D_tensor_argument(idx, _input, collapsed_window);
- if(!_run_in_place)
+ add_3D_tensor_argument(idx, _mean, collapsed_window);
+
+ if (!_run_in_place)
{
add_4D_tensor_argument(idx, _output, collapsed_window);
}
diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
index d4444f0b20..9f436da7f6 100644
--- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
+++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H
#define ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H
-#include "src/core/CL/ICLKernel.h"
-
#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/CL/ICLKernel.h"
+
namespace arm_compute
{
// Forward declarations
@@ -52,21 +52,18 @@ public:
/** Set the input and output tensors.
*
- * @param[in, out] input Source tensor. Data types supported: F16/F32. Data layout supported: NCHW, NHWC
- * In case of @p output tensor = nullptr this tensor will store the result of the normalization.
- * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
- * @param[in] info Kernel meta-data descriptor
- */
- void configure(ICLTensor *input, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info);
- /** Set the input and output tensors.
- *
* @param[in] compile_context The compile context to be used.
* @param[in, out] input Source tensor. Data types supported: F16/F32. Data layout supported: NCHW, NHWC
* In case of @p output tensor = nullptr this tensor will store the result of the normalization.
+ * @param[in] mean_var Tensor containing the precomputed mean and variance values. Data types supported: F32.
* @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
* @param[in] info Kernel meta-data descriptor
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *mean_var,
+ ICLTensor *output,
+ const InstanceNormalizationLayerKernelInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer.
*
@@ -76,7 +73,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
@@ -84,7 +82,53 @@ public:
private:
ICLTensor *_input;
ICLTensor *_output;
+ ICLTensor *_mean;
bool _run_in_place;
};
+
+/** Interface to compute the mean and variance per channel */
+class CLComputeMeanVariance : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLComputeMeanVariance();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLComputeMeanVariance(const CLComputeMeanVariance &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLComputeMeanVariance &operator=(const CLComputeMeanVariance &) = delete;
+ /** Default move constructor */
+ CLComputeMeanVariance(CLComputeMeanVariance &&) = default;
+ /** Default move assignment operator */
+ CLComputeMeanVariance &operator=(CLComputeMeanVariance &&) = default;
+ /** Default destructor */
+ ~CLComputeMeanVariance() = default;
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. Data types supported: F16/F32. Data layout supported: NCHW, NHWC
+ * @param[out] output Destination tensor holding the computed mean and variance. Data type supported: F32 when @p use_mixed_precision is enabled, otherwise same as @p input.
+ * @param[in] use_mixed_precision Use mixed precision in case of FP16 execution
+ */
+ void
+ configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref CLComputeMeanVariance.
+ *
+ * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported: NHWC, NCHW
+ * @param[in] output Destination tensor info holding the computed mean and variance. Data type supported: F32 or same as @p input.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_input;
+ ICLTensor *_output;
+};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H */
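The rework above splits instance normalization into a two-stage pipeline: CLComputeMeanVariance writes the per-channel mean and variance into the mean_var tensor, which CLInstanceNormalizationLayerKernel then consumes to normalize each plane. The following is a minimal host-side sketch of the math the two stages implement, assuming float accumulation (the use_mixed_precision path); the helper name instance_norm_reference is illustrative, not part of the library.

#include <cmath>
#include <cstddef>
#include <vector>

// Reference for one plane (one channel of one batch item).
// Stage 1 (CLComputeMeanVariance): accumulate sum and sum-of-squares in float,
// then derive mean and variance as E[x^2] - E[x]^2.
// Stage 2 (CLInstanceNormalizationLayerKernel):
//   out = gamma * (x - mean) / sqrt(var + epsilon) + beta.
void instance_norm_reference(const std::vector<float> &plane, std::vector<float> &out,
                             float gamma, float beta, float epsilon)
{
    float sum    = 0.f;
    float sum_sq = 0.f;
    for (float v : plane)
    {
        sum += v;
        sum_sq += v * v;
    }
    const float mean = sum / plane.size();
    const float var  = sum_sq / plane.size() - mean * mean;

    out.resize(plane.size());
    for (std::size_t i = 0; i < plane.size(); ++i)
    {
        out[i] = gamma * (plane[i] - mean) / std::sqrt(var + epsilon) + beta;
    }
}

Splitting the statistics out lets the normalization kernel stay padding-free and reuse the precomputed per-channel statistics across the whole plane.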
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
index 9e91d98f7c..9ed9d7c5b0 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,11 +29,12 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
namespace arm_compute
@@ -42,9 +43,8 @@ namespace
{
constexpr int max_input_tensor_dim = 3;
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
{
ARM_COMPUTE_UNUSED(epsilon);
@@ -54,14 +54,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis > 2, "Actual axis greater than 2 is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, "Actual normalization axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions,
+ "Actual normalization axis greater than max number of dimensions");
// Reduce shape on axis
TensorShape sum_shape = input->tensor_shape();
sum_shape.set(actual_axis, 1);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -71,40 +72,30 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons
return Status{};
}
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type());
-
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, input->valid_region());
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-
- return std::make_tuple(err, win);
-}
} // namespace
CLL2NormalizeLayerKernel::CLL2NormalizeLayerKernel()
: _input(nullptr), _sum(nullptr), _output(nullptr), _actual_axis(0), _epsilon(1e-12)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLL2NormalizeLayerKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon)
+void CLL2NormalizeLayerKernel::configure(
+ const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon)
{
configure(CLKernelLibrary::get().get_compile_context(), input, sum, output, axis, epsilon);
}
-void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon)
+void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *sum,
+ ICLTensor *output,
+ int axis,
+ float epsilon)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon));
+ auto padding_info = get_padding_info({input, sum, output});
_input = input;
_sum = sum;
@@ -112,35 +103,40 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context
_actual_axis = wrap_around(axis, max_input_tensor_dim);
_epsilon = epsilon;
+ const unsigned int vec_size_x =
+ adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
+ const int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
+
// Set build options
- std::set<std::string> build_opts;
- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DVEC_SIZE_X=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER_X=" + support::cpp11::to_string(vec_size_x_leftovers));
// Create kernel
std::string kernel_name;
unsigned int idx = 0;
- switch(_actual_axis)
+ switch (_actual_axis)
{
case 0:
- kernel_name = "x";
+ kernel_name = "l2_normalize_x";
idx = num_arguments_per_2D_tensor() * 3;
break;
case 1:
- kernel_name = "y";
+ kernel_name = "l2_normalize_y";
idx = num_arguments_per_2D_tensor() * 3;
break;
case 2:
- kernel_name = "z";
+ kernel_name = "l2_normalize_z";
idx = num_arguments_per_3D_tensor() * 3;
break;
default:
ARM_COMPUTE_ERROR("Axis not supported");
}
- _kernel = create_kernel(compile_context, "l2_normalize_" + kernel_name, build_opts);
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Set epsilon argument
- if(input->info()->data_type() == DataType::F32)
+ if (input->info()->data_type() == DataType::F32)
{
_kernel.setArg<cl_float>(idx, _epsilon);
}
@@ -150,17 +146,19 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context
}
// Configure kernel window
- auto win_config = validate_and_configure_window(_input->info(), _output->info());
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+ Window win = calculate_max_window(*input->info(), Steps(vec_size_x));
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type());
- ICLKernel::configure_internal(std::get<1>(win_config));
+ ICLKernel::configure_internal(win);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
+Status CLL2NormalizeLayerKernel::validate(
+ const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
-
return Status{};
}
@@ -171,7 +169,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
Window window_sum(window);
- switch(_actual_axis)
+ switch (_actual_axis)
{
case 0:
{
@@ -185,8 +183,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
add_2D_tensor_argument(idx, _sum, sum_slice);
add_2D_tensor_argument(idx, _output, in_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
+ } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
}
break;
case 1:
@@ -201,8 +198,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
add_2D_tensor_argument(idx, _sum, sum_slice);
add_2D_tensor_argument(idx, _output, in_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
+ } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
}
break;
case 2:
@@ -217,8 +213,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
add_3D_tensor_argument(idx, _sum, sum_slice);
add_3D_tensor_argument(idx, _output, in_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice));
+ } while (window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice));
}
break;
default:
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.h b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h
index edc0585217..5c9ab94ce5 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.h
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLL2NORMALIZELAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -70,7 +71,12 @@ public:
* @param[in] axis Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2
* @param[in] epsilon Lower bound value for the normalization.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *sum,
+ ICLTensor *output,
+ int axis,
+ float epsilon);
/** Static function to check if given info will lead to a valid configuration of @ref CLL2NormalizeLayerKernel.
*
@@ -84,7 +90,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
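The L2 normalize rework drops the fixed 16-element step and its padding requirement: the vector width is now chosen with adjust_vec_size() and the row tail is handled in-kernel via VEC_SIZE_LEFTOVER_X. A minimal sketch of that arithmetic follows; adjust_vec_size_sketch is a hypothetical stand-in for the library helper, not its actual implementation.

#include <cstddef>

// Hypothetical stand-in for arm_compute's adjust_vec_size() (declared in
// arm_compute/core/utils/helpers/AdjustVecSize.h): halve the vector width
// until it no longer exceeds the innermost dimension.
unsigned int adjust_vec_size_sketch(unsigned int vec_size, std::size_t dim0)
{
    while (vec_size > dim0 && vec_size > 1)
    {
        vec_size /= 2;
    }
    return vec_size;
}

// Example: a 4-element vector over a row of 130 elements processes 32 full
// vectors and leaves 130 % 4 == 2 elements for the VEC_SIZE_LEFTOVER_X path.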
diff --git a/src/core/CL/kernels/CLLKTrackerKernel.cpp b/src/core/CL/kernels/CLLKTrackerKernel.cpp
deleted file mode 100644
index a439c2448e..0000000000
--- a/src/core/CL/kernels/CLLKTrackerKernel.cpp
+++ /dev/null
@@ -1,314 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLLKTrackerKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <cmath>
-
-using namespace arm_compute;
-
-void CLLKTrackerInitKernel::configure(const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates,
- ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
- bool use_initial_estimate, size_t level, size_t num_levels, float pyramid_scale)
-{
- configure(CLKernelLibrary::get().get_compile_context(), old_points, new_points_estimates, old_points_internal, new_points_internal, use_initial_estimate, level, num_levels, pyramid_scale);
-}
-
-void CLLKTrackerInitKernel::configure(const CLCompileContext &compile_context, const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates,
- ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
- bool use_initial_estimate, size_t level, size_t num_levels, float pyramid_scale)
-
-{
- ARM_COMPUTE_ERROR_ON(old_points == nullptr);
- ARM_COMPUTE_ERROR_ON(old_points_internal == nullptr);
- ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
-
- const float scale = std::pow(pyramid_scale, level);
-
- // Create kernel
- std::string kernel_name = "init_level";
- if(level == (num_levels - 1))
- {
- kernel_name += (use_initial_estimate) ? std::string("_max_initial_estimate") : std::string("_max");
- }
- _kernel = create_kernel(compile_context, kernel_name);
-
- // Set static kernel arguments
- unsigned int idx = 0;
- if(level == (num_levels - 1))
- {
- _kernel.setArg(idx++, old_points->cl_buffer());
- if(use_initial_estimate)
- {
- _kernel.setArg(idx++, new_points_estimates->cl_buffer());
- }
- }
- _kernel.setArg(idx++, old_points_internal->cl_buffer());
- _kernel.setArg(idx++, new_points_internal->cl_buffer());
- _kernel.setArg<cl_float>(idx++, scale);
-
- // Configure kernel window
- Window window;
- window.set(Window::DimX, Window::Dimension(0, old_points->num_values(), 1));
- window.set(Window::DimY, Window::Dimension(0, 1, 1));
- ICLKernel::configure_internal(window);
-}
-
-void CLLKTrackerInitKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- enqueue(queue, *this, window, lws_hint());
-}
-
-void CLLKTrackerFinalizeKernel::configure(ICLLKInternalKeypointArray *new_points_internal, ICLKeyPointArray *new_points)
-{
- configure(CLKernelLibrary::get().get_compile_context(), new_points_internal, new_points);
-}
-
-void CLLKTrackerFinalizeKernel::configure(const CLCompileContext &compile_context, ICLLKInternalKeypointArray *new_points_internal, ICLKeyPointArray *new_points)
-
-{
- ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
- ARM_COMPUTE_ERROR_ON(new_points == nullptr);
-
- // Create kernel
- _kernel = create_kernel(compile_context, "finalize");
-
- // Set static kernel arguments
- unsigned int idx = 0;
- _kernel.setArg(idx++, new_points_internal->cl_buffer());
- _kernel.setArg(idx++, new_points->cl_buffer());
-
- // Configure kernel window
- Window window;
- window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1));
- window.set(Window::DimY, Window::Dimension(0, 1, 1));
- ICLKernel::configure_internal(window);
-}
-
-void CLLKTrackerFinalizeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- enqueue(queue, *this, window, lws_hint());
-}
-
-CLLKTrackerStage0Kernel::CLLKTrackerStage0Kernel()
- : _old_input(nullptr), _old_scharr_gx(nullptr), _old_scharr_gy(nullptr)
-{
-}
-
-void CLLKTrackerStage0Kernel::configure(const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy,
- ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
- ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
- size_t window_dimension, size_t level)
-{
- configure(CLKernelLibrary::get().get_compile_context(), old_input, old_scharr_gx, old_scharr_gy, old_points_internal, new_points_internal, coeff_table, old_ival, window_dimension, level);
-}
-
-void CLLKTrackerStage0Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy,
- ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
- ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
- size_t window_dimension, size_t level)
-
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_scharr_gx, 1, DataType::S16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_scharr_gy, 1, DataType::S16);
- ARM_COMPUTE_ERROR_ON(old_points_internal == nullptr);
- ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
- ARM_COMPUTE_ERROR_ON(coeff_table == nullptr);
- ARM_COMPUTE_ERROR_ON(old_ival == nullptr);
-
- _old_input = old_input;
- _old_scharr_gx = old_scharr_gx;
- _old_scharr_gy = old_scharr_gy;
-
- // Configure kernel window
- Window window;
- window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1));
- window.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- const ValidRegion valid_region = intersect_valid_regions(
- old_input->info()->valid_region(),
- old_scharr_gx->info()->valid_region(),
- old_scharr_gy->info()->valid_region());
-
- update_window_and_padding(window,
- AccessWindowStatic(old_input->info(), valid_region.start(0), valid_region.start(1),
- valid_region.end(0), valid_region.end(1)),
- AccessWindowStatic(old_scharr_gx->info(), valid_region.start(0), valid_region.start(1),
- valid_region.end(0), valid_region.end(1)),
- AccessWindowStatic(old_scharr_gy->info(), valid_region.start(0), valid_region.start(1),
- valid_region.end(0), valid_region.end(1)));
-
- ICLKernel::configure_internal(window);
-
- // Initialize required variables
- const int level0 = (level == 0) ? 1 : 0;
- const int window_size = window_dimension;
- const int window_size_squared = window_dimension * window_dimension;
- const int window_size_half = window_dimension / 2;
- const float eig_const = 1.0f / (2.0f * window_size_squared);
- const cl_float3 border_limits =
- {
- {
- // -1 because we load 2 values at once for bilinear interpolation
- static_cast<cl_float>(valid_region.end(0) - window_size - 1),
- static_cast<cl_float>(valid_region.end(1) - window_size - 1),
- static_cast<cl_float>(valid_region.start(0))
- }
- };
-
- // Create kernel
- _kernel = create_kernel(compile_context, "lktracker_stage0");
-
- // Set arguments
- unsigned int idx = 3 * num_arguments_per_2D_tensor();
- _kernel.setArg(idx++, old_points_internal->cl_buffer());
- _kernel.setArg(idx++, new_points_internal->cl_buffer());
- _kernel.setArg(idx++, coeff_table->cl_buffer());
- _kernel.setArg(idx++, old_ival->cl_buffer());
- _kernel.setArg<cl_int>(idx++, window_size);
- _kernel.setArg<cl_int>(idx++, window_size_squared);
- _kernel.setArg<cl_int>(idx++, window_size_half);
- _kernel.setArg<cl_float3>(idx++, border_limits);
- _kernel.setArg<cl_float>(idx++, eig_const);
- _kernel.setArg<cl_int>(idx++, level0);
-}
-
-void CLLKTrackerStage0Kernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- // Set static tensor arguments. Setting here as allocation might be deferred.
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _old_input, window);
- add_2D_tensor_argument(idx, _old_scharr_gx, window);
- add_2D_tensor_argument(idx, _old_scharr_gy, window);
-
- enqueue(queue, *this, window, lws_hint());
-}
-
-CLLKTrackerStage1Kernel::CLLKTrackerStage1Kernel()
- : _new_input(nullptr)
-{
-}
-
-void CLLKTrackerStage1Kernel::configure(const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
- Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level)
-{
- configure(CLKernelLibrary::get().get_compile_context(), new_input, new_points_internal, coeff_table, old_ival, termination, epsilon, num_iterations, window_dimension, level);
-}
-
-void CLLKTrackerStage1Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table,
- ICLOldValArray *old_ival,
- Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level)
-
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(new_input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
- ARM_COMPUTE_ERROR_ON(coeff_table == nullptr);
- ARM_COMPUTE_ERROR_ON(old_ival == nullptr);
-
- _new_input = new_input;
-
- // Configure kernel window
- Window window;
- window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1));
- window.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- const ValidRegion &valid_region = new_input->info()->valid_region();
-
- update_window_and_padding(window,
- AccessWindowStatic(new_input->info(), valid_region.start(0), valid_region.start(1),
- valid_region.end(0), valid_region.end(1)));
-
- ICLKernel::configure_internal(window);
-
- // Initialize required variables
- const int level0 = (level == 0) ? 1 : 0;
- const int window_size = window_dimension;
- const int window_size_squared = window_dimension * window_dimension;
- const int window_size_half = window_dimension / 2;
- const float eig_const = 1.0f / (2.0f * window_size_squared);
- const cl_float3 border_limits =
- {
- {
- // -1 because we load 2 values at once for bilinear interpolation
- static_cast<cl_float>(valid_region.end(0) - window_size - 1),
- static_cast<cl_float>(valid_region.end(1) - window_size - 1),
- static_cast<cl_float>(valid_region.start(0))
- }
- };
-
- // Set maximum number of iterations used for convergence
- const size_t max_iterations = 1000;
- num_iterations = (termination == Termination::TERM_CRITERIA_EPSILON) ? max_iterations : num_iterations;
-
- const int term_epsilon = (termination == Termination::TERM_CRITERIA_EPSILON || termination == Termination::TERM_CRITERIA_BOTH) ? 1 : 0;
-
- // Create kernel
- _kernel = create_kernel(compile_context, "lktracker_stage1");
-
- // Set static kernel arguments
- unsigned int idx = num_arguments_per_2D_tensor();
- _kernel.setArg(idx++, new_points_internal->cl_buffer());
- _kernel.setArg(idx++, coeff_table->cl_buffer());
- _kernel.setArg(idx++, old_ival->cl_buffer());
- _kernel.setArg<cl_int>(idx++, window_size);
- _kernel.setArg<cl_int>(idx++, window_size_squared);
- _kernel.setArg<cl_int>(idx++, window_size_half);
- _kernel.setArg<cl_int>(idx++, num_iterations);
- _kernel.setArg<cl_float>(idx++, epsilon);
- _kernel.setArg<cl_float3>(idx++, border_limits);
- _kernel.setArg<cl_float>(idx++, eig_const);
- _kernel.setArg<cl_int>(idx++, level0);
- _kernel.setArg<cl_int>(idx++, term_epsilon);
-}
-
-void CLLKTrackerStage1Kernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- // Set static tensor arguments. Setting here as allocation might be deferred.
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _new_input, window);
-
- enqueue(queue, *this, window, lws_hint());
-}
diff --git a/src/core/CL/kernels/CLLKTrackerKernel.h b/src/core/CL/kernels/CLLKTrackerKernel.h
deleted file mode 100644
index 2d2966854a..0000000000
--- a/src/core/CL/kernels/CLLKTrackerKernel.h
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLLKTRACKERKERNEL_H
-#define ARM_COMPUTE_CLLKTRACKERKERNEL_H
-
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface to run the initialization step of LKTracker */
-class CLLKTrackerInitKernel : public ICLKernel
-{
-public:
- /** Initialise the kernel input and output
- *
- * @param[in] old_points Pointer to the @ref ICLKeyPointArray storing old key points
- * @param[in] new_points_estimates Pointer to the @ref ICLKeyPointArray storing new estimates key points
- * @param[out] old_points_internal Pointer to the array of internal @ref CLLKInternalKeypoint old points
- * @param[out] new_points_internal Pointer to the array of internal @ref CLLKInternalKeypoint new points
- * @param[in] use_initial_estimate The flag to indicate whether the initial estimated position should be used
- * @param[in] level The pyramid level
- * @param[in] num_levels The number of pyramid levels
- * @param[in] pyramid_scale Scale factor used for generating the pyramid
- */
- void configure(const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates,
- ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
- bool use_initial_estimate, size_t level, size_t num_levels, float pyramid_scale);
- /** Initialise the kernel input and output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] old_points Pointer to the @ref ICLKeyPointArray storing old key points
- * @param[in] new_points_estimates Pointer to the @ref ICLKeyPointArray storing new estimates key points
- * @param[out] old_points_internal Pointer to the array of internal @ref CLLKInternalKeypoint old points
- * @param[out] new_points_internal Pointer to the array of internal @ref CLLKInternalKeypoint new points
- * @param[in] use_initial_estimate The flag to indicate whether the initial estimated position should be used
- * @param[in] level The pyramid level
- * @param[in] num_levels The number of pyramid levels
- * @param[in] pyramid_scale Scale factor used for generating the pyramid
- */
- void configure(const CLCompileContext &compile_context, const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates,
- ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
- bool use_initial_estimate, size_t level, size_t num_levels, float pyramid_scale);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-};
-
-/** Interface to run the finalize step of LKTracker, where it truncates the coordinates stored in new_points array */
-class CLLKTrackerFinalizeKernel : public ICLKernel
-{
-public:
- /** Initialise the kernel input and output
- *
- * @param[in] new_points_internal Pointer to the array of internal @ref CLLKInternalKeypoint new points
- * @param[out] new_points Pointer to the @ref ICLKeyPointArray storing new key points
- */
- void configure(ICLLKInternalKeypointArray *new_points_internal, ICLKeyPointArray *new_points);
- /** Initialise the kernel input and output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] new_points_internal Pointer to the array of internal @ref CLLKInternalKeypoint new points
- * @param[out] new_points Pointer to the @ref ICLKeyPointArray storing new key points
- */
- void configure(const CLCompileContext &compile_context, ICLLKInternalKeypointArray *new_points_internal, ICLKeyPointArray *new_points);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-};
-
-/** Interface to run the first stage of LKTracker, where A11, A12, A22, min_eig, ival, ixval and iyval are computed */
-class CLLKTrackerStage0Kernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLLKTrackerStage0Kernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLLKTrackerStage0Kernel(const CLLKTrackerStage0Kernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLLKTrackerStage0Kernel &operator=(const CLLKTrackerStage0Kernel &) = delete;
- /** Allow instances of this class to be moved */
- CLLKTrackerStage0Kernel(CLLKTrackerStage0Kernel &&) = default;
- /** Allow instances of this class to be moved */
- CLLKTrackerStage0Kernel &operator=(CLLKTrackerStage0Kernel &&) = default;
- /** Initialise the kernel input and output
- *
- * @param[in] old_input Pointer to the input old tensor. Data types supported: U8
- * @param[in] old_scharr_gx Pointer to the input scharr X tensor. Data types supported: S16
- * @param[in] old_scharr_gy Pointer to the input scharr Y tensor. Data types supported: S16
- * @param[in] old_points_internal Pointer to the array of CLLKInternalKeypoint old points
- * @param[in, out] new_points_internal Pointer to the array of CLLKInternalKeypoint new points
- * @param[out] coeff_table Pointer to the array holding the Spatial Gradient coefficients
- * @param[out] old_ival Pointer to the array holding internal values
- * @param[in] window_dimension The size of the window on which to perform the algorithm
- * @param[in] level The pyramid level
- */
- void configure(const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy,
- ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
- ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
- size_t window_dimension, size_t level);
- /** Initialise the kernel input and output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] old_input Pointer to the input old tensor. Data types supported: U8
- * @param[in] old_scharr_gx Pointer to the input scharr X tensor. Data types supported: S16
- * @param[in] old_scharr_gy Pointer to the input scharr Y tensor. Data types supported: S16
- * @param[in] old_points_internal Pointer to the array of CLLKInternalKeypoint old points
- * @param[in, out] new_points_internal Pointer to the array of CLLKInternalKeypoint new points
- * @param[out] coeff_table Pointer to the array holding the Spatial Gradient coefficients
- * @param[out] old_ival Pointer to the array holding internal values
- * @param[in] window_dimension The size of the window on which to perform the algorithm
- * @param[in] level The pyramid level
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy,
- ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
- ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
- size_t window_dimension, size_t level);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_old_input;
- const ICLTensor *_old_scharr_gx;
- const ICLTensor *_old_scharr_gy;
-};
-
-/** Interface to run the second stage of LKTracker, where the motion vectors of the given points are computed */
-class CLLKTrackerStage1Kernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLLKTrackerStage1Kernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLLKTrackerStage1Kernel(const CLLKTrackerStage1Kernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLLKTrackerStage1Kernel &operator=(const CLLKTrackerStage1Kernel &) = delete;
- /** Allow instances of this class to be moved */
- CLLKTrackerStage1Kernel(CLLKTrackerStage1Kernel &&) = default;
- /** Allow instances of this class to be moved */
- CLLKTrackerStage1Kernel &operator=(CLLKTrackerStage1Kernel &&) = default;
- /** Initialise the kernel input and output
- *
- * @param[in] new_input Pointer to the input new tensor. Data types supported: U8
- * @param[in, out] new_points_internal Pointer to the array of CLLKInternalKeypoint for new points
- * @param[in] coeff_table Pointer to the array holding the Spatial Gradient coefficients
- * @param[in] old_ival Pointer to the array holding internal values
- * @param[in] termination The criteria to terminate the search of each keypoint.
- * @param[in] epsilon The error for terminating the algorithm
- * @param[in] num_iterations The maximum number of iterations before terminating the algorithm
- * @param[in] window_dimension The size of the window on which to perform the algorithm
- * @param[in] level The pyramid level
- */
- void configure(const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
- Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level);
- /** Initialise the kernel input and output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] new_input Pointer to the input new tensor. Data types supported: U8
- * @param[in, out] new_points_internal Pointer to the array of CLLKInternalKeypoint for new points
- * @param[in] coeff_table Pointer to the array holding the Spatial Gradient coefficients
- * @param[in] old_ival Pointer to the array holding internal values
- * @param[in] termination The criteria to terminate the search of each keypoint.
- * @param[in] epsilon The error for terminating the algorithm
- * @param[in] num_iterations The maximum number of iterations before terminating the algorithm
- * @param[in] window_dimension The size of the window on which to perform the algorithm
- * @param[in] level The pyramid level
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
- Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_new_input;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLLKTRACKERKERNEL_H */
diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
index d0ac12dcd7..e560f1de4a 100644
--- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,9 +29,9 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -43,26 +43,31 @@ using namespace misc::shape_calculator;
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, indices);
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- PoolingType pool_type = pool_info.pool_type;
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ PoolingType pool_type = pool_info.pool_type;
+ const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- const int pool_size_x = pool_info.pool_size.width;
- const int pool_size_y = pool_info.pool_size.height;
+ const int pool_size_x = pool_info.pool_size.width;
+ const int pool_size_y = pool_info.pool_size.height;
const Size2D pool_size(pool_size_x, pool_size_y);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX,
+ "Pooling indices only supported for MAX pooling method");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -72,16 +77,20 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-CLMaxUnpoolingLayerKernel::CLMaxUnpoolingLayerKernel()
- : _input(nullptr), _output(nullptr), _indices(nullptr)
+CLMaxUnpoolingLayerKernel::CLMaxUnpoolingLayerKernel() : _input(nullptr), _output(nullptr), _indices(nullptr)
{
+ _type = CLKernelType::POOL;
}
-void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info)
+void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, indices->info()));
- auto padding_info = get_padding_info({ input, indices, output });
+ auto padding_info = get_padding_info({input, indices, output});
_input = input;
_output = output;
@@ -119,7 +128,10 @@ void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_contex
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLMaxUnpoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+Status CLMaxUnpoolingLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *indices,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, indices));
@@ -140,7 +152,6 @@ void CLMaxUnpoolingLayerKernel::run(const Window &window, cl::CommandQueue &queu
add_3D_tensor_argument(idx, _output, slice);
add_3D_tensor_argument(idx, _indices, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
index cc96cf1a1f..eb18a46784 100644
--- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
+++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
@@ -53,26 +53,33 @@ public:
* @param[in] compile_context The compile context to be used.
* @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] indices Tensor containing the offset to store the input elements in the output tensor.
- * @ref opencl::ClPooling with indices should precede this function in order to
+ * @ref CLPoolingLayer with indices should precede this function in order to
* properly reconstruct the output tensor.
* The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
* @param[out] output Destination tensor. Data types supported: Same as @p input.
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLMaxUnpoolingLayerKernel
*
* @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] output Destination tensor info. Data types supported: Same as @p input.
* @param[in] indices TensorInfo associated to the tensor containing the offset to store the input elements in the output tensor.
- * @ref opencl::ClPooling with indices should precede this function in order to
+ * @ref CLPoolingLayer with indices should precede this function in order to
* properly reconstruct the output tensor.
* The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *indices,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info);
// Inherited methods overridden
void run(const Window &window, cl::CommandQueue &queue) override;
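Max unpooling is a scatter: each pooled value is written back to the offset recorded in the U32 indices tensor produced by the preceding max pooling, and every other output element stays zero, which is why validate() insists on MAX pooling with a 2x2 pool size. A hedged reference sketch, assuming the indices are linear offsets into the unpooled output:

#include <cstddef>
#include <cstdint>
#include <vector>

// Reference of the scatter performed by max unpooling. The unpooled buffer
// is assumed to be pre-sized to the original (pre-pooling) shape and
// zero-filled before the scatter.
void max_unpool_reference(const std::vector<float>         &pooled,
                          const std::vector<std::uint32_t> &indices,
                          std::vector<float>               &unpooled)
{
    for (std::size_t i = 0; i < pooled.size(); ++i)
    {
        unpooled[indices[i]] = pooled[i];
    }
}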
diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
index a889df7930..8632bdf623 100644
--- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,6 +29,9 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -47,39 +50,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
return Status{};
}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- if(output != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, *input);
- }
-
- const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
-
- // This kernel doesn't need padding
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- if(output != nullptr)
- {
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
- }
-
- return std::make_pair(Status{}, win);
-}
} // namespace
CLMeanStdDevNormalizationKernel::CLMeanStdDevNormalizationKernel()
: _input(nullptr), _output(nullptr), _run_in_place(false)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLMeanStdDevNormalizationKernel::configure(ICLTensor *input, ICLTensor *output, float epsilon)
@@ -87,18 +70,28 @@ void CLMeanStdDevNormalizationKernel::configure(ICLTensor *input, ICLTensor *out
configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon);
}
-void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon)
+void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ float epsilon)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
_run_in_place = (output == nullptr) || (output == input);
- ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevNormalizationKernel::validate(input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
+ ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevNormalizationKernel::validate(
+ input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
+
+ if (output != nullptr)
+ {
+ auto_init_if_empty(*output->info(), *input->info());
+ }
_input = input;
_output = output;
- const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
// Set build options
CLBuildOptions build_opts;
@@ -106,15 +99,15 @@ void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon));
build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.add_option_if(input->info()->data_type() == DataType::F16, "-DMEANSTDNORM_HALF");
build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
// Create kernel
_kernel = create_kernel(compile_context, "mean_stddev_normalization", build_opts.options());
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ ICLKernel::configure_internal(win);
// Set config_id for enabling LWS tuning
_config_id = "mean_stddev_normalization_layer_";
@@ -128,7 +121,6 @@ void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_
Status CLMeanStdDevNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float epsilon)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, epsilon));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output != nullptr) ? output->clone().get() : nullptr).first);
return Status{};
}
@@ -148,7 +140,6 @@ void CLMeanStdDevNormalizationKernel::run(const Window &window, cl::CommandQueue
add_2D_tensor_argument_if((!_run_in_place), idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
+ } while (window.slide_window_slice_2D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
index a1ba2b905e..e02a3c58a3 100644
--- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
+++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
@@ -66,7 +66,10 @@ public:
* @param[out] output (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input
* @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output = nullptr, float epsilon = 1e-8f);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output = nullptr,
+ float epsilon = 1e-8f);
/** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevNormalizationKernel
*
* @param[in] input Source tensor info with 2 dimensions. In case of @p output tensor info = nullptr,
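The kernel normalizes each row of the 2D input to zero mean and unit variance, with epsilon guarding against division by zero when the standard deviation vanishes; the reworked configure() picks the vector size with adjust_vec_size() so no padding is needed. A minimal per-row reference sketch of that computation:

#include <cmath>
#include <cstddef>
#include <vector>

// Per-row reference of mean_stddev_normalization: each row of a
// [width x batch] tensor is shifted to zero mean and scaled to unit
// variance, with epsilon stabilizing the division.
void mean_stddev_normalize_row(float *row, std::size_t width, float epsilon)
{
    float sum    = 0.f;
    float sum_sq = 0.f;
    for (std::size_t i = 0; i < width; ++i)
    {
        sum += row[i];
        sum_sq += row[i] * row[i];
    }
    const float mean       = sum / width;
    const float var        = sum_sq / width - mean * mean;
    const float inv_stddev = 1.f / std::sqrt(var + epsilon);
    for (std::size_t i = 0; i < width; ++i)
    {
        row[i] = (row[i] - mean) * inv_stddev;
    }
}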
diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
deleted file mode 100644
index 7017efa3c2..0000000000
--- a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLMinMaxLayerKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <climits>
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
-
- if(output->tensor_shape().total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
- TensorShape output_shape = compute_min_max_shape(input);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
- }
-
- return Status{};
-}
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- TensorShape output_shape = compute_min_max_shape(input);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output, output_shape, 1, input->data_type());
-
- const unsigned int num_elems_processed_per_iteration = 1;
-
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowStatic output_access(output, 0, 0, 2, output->dimension(1));
-
- bool window_changed = update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_tuple(err, win);
-}
-} // namespace
-
-CLMinMaxLayerKernel::CLMinMaxLayerKernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLMinMaxLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLMinMaxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
- _input = input;
- _output = output;
-
- std::set<std::string> build_opts;
- build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
- build_opts.emplace("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
- build_opts.emplace("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
-
- // Create kernel
- _kernel = create_kernel(compile_context, "minmax_layer", build_opts);
-
- auto win_config = validate_and_configure_window(input->info(), output->info());
-
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
- ICLKernel::configure_internal(std::get<1>(win_config));
-}
-
-Status CLMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
-
- return Status{};
-}
-
-void CLMinMaxLayerKernel::reset(cl::CommandQueue &queue)
-{
- _output->map(queue, true);
-
- Window window_output;
- window_output.use_tensor_dimensions(_output->info()->tensor_shape());
- window_output.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator output(_output, window_output);
-
- // Reset output
- execute_window_loop(window_output, [&](const Coordinates &)
- {
- auto *ptr = reinterpret_cast<float *>(output.ptr());
- ptr[0] = std::numeric_limits<float>::max();
- ptr[1] = std::numeric_limits<float>::min();
- },
- output);
-
- _output->unmap(queue);
-}
-
-void CLMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3);
- Window slice = window_collapsed.first_slice_window_3D();
- slice.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice.set(Window::DimY, Window::Dimension(0, 1, 1));
- slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
- do
- {
- Window output_slice = slice.shift_dimensions(2);
-
- unsigned int idx = 0;
- // Set inputs
- add_3D_tensor_argument(idx, _input, slice);
- add_1D_tensor_argument(idx, _output, output_slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
-}
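
One subtlety in the removed reset(): ptr[1], the running-maximum slot, is seeded with std::numeric_limits<float>::min(), which is the smallest positive normal float rather than the most negative representable value (that is lowest()). A quick check of the distinction:

    #include <cstdio>
    #include <limits>

    int main()
    {
        // min() is a tiny positive number; lowest() is the true lower bound of float.
        std::printf("min    = %g\n", std::numeric_limits<float>::min());    // ~1.17549e-38
        std::printf("lowest = %g\n", std::numeric_limits<float>::lowest()); // ~-3.40282e+38
        return 0;
    }
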
diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.h b/src/core/CL/kernels/CLMinMaxLayerKernel.h
deleted file mode 100644
index aa2ff3f375..0000000000
--- a/src/core/CL/kernels/CLMinMaxLayerKernel.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMINMAXLAYERKERNEL_H
-#define ARM_COMPUTE_CLMINMAXLAYERKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to perform min max search on a 3D tensor.
- */
-class CLMinMaxLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLMinMaxLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLMinMaxLayerKernel(const CLMinMaxLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLMinMaxLayerKernel &operator=(const CLMinMaxLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLMinMaxLayerKernel(CLMinMaxLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLMinMaxLayerKernel &operator=(CLMinMaxLayerKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches.Data types supported: F32.
- * @param[out] output Output tensor with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
- * The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
- */
- void configure(const ICLTensor *input, ICLTensor *output);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches.Data types supported: F32.
- * @param[out] output Output tensor with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
- * The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLMinMaxLayerKernel
- *
- * @param[in] input Input tensor info. Data types supported: F32.
- * @param[in] output Output tensor info with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
- * The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
- /** Resets global minimum and maximum
- *
- * @param[in,out] queue Command queue on which to map and unmap the min_max tensor
- */
- void reset(cl::CommandQueue &queue);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLMINMAXLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index d1982e77b9..b636c485e7 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,7 +29,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -37,11 +40,10 @@
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
namespace
{
-constexpr unsigned int num_elems_processed_per_iteration = 4;
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
@@ -52,7 +54,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, N
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -62,38 +64,66 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, N
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, NormalizationLayerInfo norm_info)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, NormalizationLayerInfo norm_info)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output, *input->clone());
- const unsigned int norm_idx = get_normalization_dimension_index(input->data_layout(), norm_info);
- const bool is_norm_accross_width = norm_idx == 0;
+ bool window_changed = false;
+ Window win;
+ const DataLayout data_layout = input->data_layout();
+ if (data_layout == DataLayout::NCHW)
+ {
+ const unsigned int vec_size_x =
+ adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
+ const unsigned int norm_idx = get_normalization_dimension_index(input->data_layout(), norm_info);
+ const bool is_norm_across_width = norm_idx == 0;
- const unsigned int border_width = is_norm_accross_width ? num_elems_processed_per_iteration - 1 : 0;
- const BorderSize border_size = BorderSize(0, border_width);
+ const unsigned int norm_radius = norm_info.norm_size() / 2;
+ // Border / padding calculation:
+ // For NCHW no border handling is implemented in the kernel along the x axis.
+ // This means the x axis is fully padded depending on vec_size_x and norm_size.
+ // E.g. for input x dimension = 3, norm_size = 3 (radius = 1), vec_size_x = 2 ('#' is an element, 'p' is padding):
+ // In : |p|#|#|#|p|p|
+ // Out: |#|#|#|p|
+ // The output has 1 element of right padding because of vec_size_x.
+ // The input has 1 element of left padding because radius = 1.
+ // The input has 2 elements of right padding because radius = 1 AND because of the extra output padding.
+ const unsigned int border_width_left = is_norm_across_width ? norm_radius : 0;
+ const unsigned int border_width_right =
+ is_norm_across_width ? norm_radius + (vec_size_x - input->dimension(0) % vec_size_x) : 0;
+ const BorderSize border_size = BorderSize(0, border_width_right, 0, border_width_left);
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- bool window_changed = false;
+ win = calculate_max_window(*input, Steps(vec_size_x));
- // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside the kernel, avoiding padding
- // Reads can occur within the valid region of the input
- if(is_norm_accross_width)
- {
- AccessWindowStatic input_access(input, -border_size.left, 0, input->dimension(0) + border_size.right, 0);
- window_changed = window_changed || update_window_and_padding(win, input_access);
+ // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside the kernel, avoiding padding
+ // Reads can occur within the valid region of the input
+ if (is_norm_across_width)
+ {
+ AccessWindowStatic input_access(input, -border_size.left, 0, input->dimension(0) + border_size.right, 0);
+ window_changed = window_changed || update_window_and_padding(win, input_access);
+ }
+ else
+ {
+ AccessWindowHorizontal input_access(input, -border_size.left, vec_size_x);
+ window_changed = window_changed || update_window_and_padding(win, input_access);
+ }
+
+ AccessWindowHorizontal output_access(output, 0, vec_size_x);
+ window_changed = window_changed || update_window_and_padding(win, output_access);
}
else
{
- AccessWindowHorizontal input_access(input, -border_size.left, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, input_access);
+ unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
+ if (norm_info.is_cross_map())
+ {
+ vec_size_x = 1;
+ }
+ win = calculate_max_window(*input, Steps(vec_size_x));
}
-
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, output_access);
- output_access.set_valid_region(win, input->valid_region());
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
@@ -101,6 +131,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
CLNormalizationLayerKernel::CLNormalizationLayerKernel()
: _input(nullptr), _output(nullptr), _border_size(0), _is_norm_across_width(false)
{
+ _type = CLKernelType::ELEMENTWISE;
}
BorderSize CLNormalizationLayerKernel::border_size() const
@@ -113,24 +144,51 @@ void CLNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *ou
configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info);
}
-void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info)
+void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), *input->info()->clone());
+ auto padding_info = get_padding_info({input, output});
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info));
+ auto win_config = validate_and_configure_window(input->info(), output->info(), norm_info);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
_input = input;
_output = output;
- const DataLayout data_layout = input->info()->data_layout();
- const unsigned int norm_idx = get_normalization_dimension_index(data_layout, norm_info);
- _is_norm_across_width = norm_idx == 0;
- const unsigned int border_width = _is_norm_across_width ? num_elems_processed_per_iteration - 1 : 0;
- _border_size = BorderSize(0, border_width);
+ const DataLayout data_layout = input->info()->data_layout();
+ unsigned int vec_size_x =
+ adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
+ int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
+ if (norm_info.is_cross_map() && data_layout == DataLayout::NHWC)
+ {
+ vec_size_x = 1;
+ vec_size_x_leftovers = 0;
+ }
+
+ if (data_layout == DataLayout::NCHW)
+ {
+ const unsigned int norm_idx = get_normalization_dimension_index(data_layout, norm_info);
+ _is_norm_across_width = norm_idx == 0;
+ const unsigned int norm_radius = norm_info.norm_size() / 2;
+ // Border / padding calculation:
+ // For NCHW no border handling is implemented in the kernel along the x axis.
+ // This means the x axis is fully padded depending on vec_size_x and norm_size.
+ // E.g. for input x dimension = 3, norm_size = 3 (radius = 1), vec_size_x = 2 ('#' is an element, 'p' is padding):
+ // In : |p|#|#|#|p|p|
+ // Out: |#|#|#|p|
+ // The output has 1 element of right padding because of vec_size_x.
+ // The input has 1 element of left padding because radius = 1.
+ // The input has 2 elements of right padding because radius = 1 AND because of the extra output padding.
+ const unsigned int border_width_left = _is_norm_across_width ? norm_radius : 0;
+ const unsigned int border_width_right =
+ _is_norm_across_width ? norm_radius + (vec_size_x - input->info()->dimension(0) % vec_size_x) : 0;
+ _border_size = BorderSize(0, border_width_right, 0, border_width_left);
+ }
const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D);
@@ -140,35 +198,29 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte
build_opts.add_option(("-DCOEFF=" + float_to_string_with_full_precision(norm_info.scale_coeff())));
build_opts.add_option(("-DBETA=" + float_to_string_with_full_precision(norm_info.beta())));
build_opts.add_option(("-DKAPPA=" + float_to_string_with_full_precision(norm_info.kappa())));
- build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)));
+ build_opts.add_option(("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftovers)));
build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size() / 2)));
build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2))));
build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D");
- build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()), "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()),
+ "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.add_option_if(norm_info.is_in_map() && data_layout == DataLayout::NHWC,
+ "-DDIM1_SIZE=" + support::cpp11::to_string(input->info()->dimension(1)));
// Create kernel
std::string kernel_name;
- if(norm_info.is_in_map())
+ if (norm_info.is_in_map())
{
kernel_name = "normalization_layer_in_map_" + lower_string(string_from_data_layout(data_layout));
}
else
{
- if(data_layout == DataLayout::NCHW)
- {
- kernel_name = "normalization_layer_cross_map";
- }
- else
- {
- // 1D Cross-Map normalization in NHWC is the same as 1D In-Map normalization in NCHW
- kernel_name = "normalization_layer_in_map_nchw";
- }
+ kernel_name = "normalization_layer_cross_map_" + lower_string(string_from_data_layout(data_layout));
}
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), norm_info);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure_internal(win_config.second);
// Set config_id for enabling LWS tuning
@@ -182,12 +234,19 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte
_config_id += support::cpp11::to_string(input->info()->dimension(0));
_config_id += "_";
_config_id += support::cpp11::to_string(input->info()->dimension(1));
+ if (data_layout == DataLayout::NHWC)
+ {
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+ }
}
-Status CLNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info)
+Status CLNormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first);
return Status{};
}
@@ -207,6 +266,6 @@ void CLNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &que
add_3D_tensor_argument(idx, _input, slice);
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
+ } while (window_collapsed.slide_window_slice_3D(slice));
}
+} // namespace arm_compute
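
To make the padding arithmetic in the new NCHW comments concrete, the worked example from the comment can be recomputed directly (a sketch that mirrors the expressions in the diff, using the same numbers):

    #include <cstdio>

    int main()
    {
        // Values from the comment: x dimension = 3, norm_size = 3, vec_size_x = 2.
        const unsigned int dim_x      = 3;
        const unsigned int norm_size  = 3;
        const unsigned int vec_size_x = 2;

        const unsigned int norm_radius        = norm_size / 2;                               // 1
        const unsigned int border_width_left  = norm_radius;                                 // 1
        const unsigned int border_width_right = norm_radius + (vec_size_x - dim_x % vec_size_x); // 1 + (2 - 1) = 2

        // In : |p|#|#|#|p|p|   Out: |#|#|#|p|
        std::printf("left=%u right=%u\n", border_width_left, border_width_right);
        return 0;
    }
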
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.h b/src/core/CL/kernels/CLNormalizationLayerKernel.h
index 739a2ae9f1..5517ba6904 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.h
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.h
@@ -63,7 +63,10 @@ public:
* Data layouts supported: same as @p input.
* @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ NormalizationLayerInfo norm_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayerKernel
*
* @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
@@ -77,7 +80,7 @@ public:
static Status validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info);
// Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
+ void run(const Window &window, cl::CommandQueue &queue) override;
BorderSize border_size() const override;
private:
diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
index 18cbe217be..59352a8fb7 100644
--- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,32 +29,37 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, std);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, std);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(mean->num_dimensions() > 1, "mean and std must be vectors");
- const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+ const unsigned int channel_idx =
+ get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != mean->dimension(0));
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -64,11 +69,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *mean, ITensorInfo *std)
+std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *output)
{
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, *input->clone());
-
const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
@@ -77,16 +79,9 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
bool window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, input->valid_region());
-
- if(input->data_layout() == DataLayout::NHWC)
- {
- AccessWindowHorizontal mean_access(mean, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal std_access(std, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, mean_access, std_access);
- }
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
@@ -94,36 +89,57 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
CLNormalizePlanarYUVLayerKernel::CLNormalizePlanarYUVLayerKernel()
: _input(nullptr), _output(nullptr), _mean(nullptr), _std(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *std)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std);
}
-void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *std)
{
// Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, mean, std);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mean->info(), std->info()));
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), *input->info()->clone());
+
+ auto padding_info = get_padding_info({input, output});
+
_input = input;
_output = output;
_mean = mean;
_std = std;
- const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
- const unsigned int channel_idx = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
- const DataType dt = input->info()->data_type();
+ const DataLayout data_layout = input->info()->data_layout();
+
+ // Get number of elements to process per iteration
+ const unsigned int num_elems_processed_per_iteration =
+ (data_layout == DataLayout::NHWC)
+ ? adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0))
+ : (16 / input->info()->element_size());
+ const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const DataType dt = input->info()->data_type();
// Set build options
CLBuildOptions build_opts;
build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ build_opts.add_option(("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)));
build_opts.add_option(("-DNUM_CHANNELS=" + support::cpp11::to_string(input->info()->dimension(channel_idx))));
std::string kernel_name = "normalize_planar_yuv_layer_";
- if(is_data_type_quantized(dt))
+ if (is_data_type_quantized(dt))
{
const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
build_opts.add_option(("-DOFFSET=" + support::cpp11::to_string(qinfo.offset)));
@@ -132,13 +148,22 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_
}
// Create kernel
- kernel_name += lower_string(string_from_data_layout(input->info()->data_layout()));
+ kernel_name += lower_string(string_from_data_layout(data_layout));
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), mean->info(), std->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
+ if (data_layout == DataLayout::NHWC)
+ {
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ ICLKernel::configure_internal(win);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+ }
+ else
+ {
+ auto win_config = validate_and_configure_window_nchw(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+ }
// Set config_id for enabling LWS tuning
_config_id = "normalize_planar_yuv_layer_";
@@ -153,11 +178,17 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_
_config_id += support::cpp11::to_string(input->info()->dimension(2));
}
-Status CLNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
+Status CLNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *std)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, std));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), mean->clone().get(), std->clone().get()).first);
-
+ if (input->data_layout() == DataLayout::NCHW)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window_nchw(input->clone().get(), output->clone().get()).first);
+ }
return Status{};
}
@@ -182,7 +213,6 @@ void CLNormalizePlanarYUVLayerKernel::run(const Window &window, cl::CommandQueue
add_3D_tensor_argument(idx, _input, slice);
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
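
The -DVEC_SIZE_LEFTOVER define introduced above is simply dimension(0) % vec_size, which the NHWC kernel uses to mask the tail access instead of relying on right padding. A small illustration, assuming an NHWC F32 tensor of width 10:

    #include <cstdio>

    int main()
    {
        // F32: 16 bytes / 4 = 4 elements per read; width 10 leaves a tail of 2.
        const unsigned int width    = 10;
        const unsigned int vec_size = 4;                // after adjust_vec_size(16 / 4, width)
        const unsigned int leftover = width % vec_size; // 2: the final access handles 2 elements

        std::printf("VEC_SIZE=%u VEC_SIZE_LEFTOVER=%u\n", vec_size, leftover);
        return 0;
    }
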
diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
index 6db4433e78..341b404e3d 100644
--- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
+++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
@@ -67,7 +67,11 @@ public:
* @param[in] std Standard deviation values tensor. 1 dimension with size equal to the number of input channels.
* Data types supported: same as @p input
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *std);
/** Static function to check if given info will lead to a valid configuration of @ref CLNormalizePlanarYUVLayerKernel
*
* @param[in] input Source tensor info. 3 lower dimensions represent a single input with dimensions [width, height, channels].
@@ -79,7 +83,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
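
For reference, the operation this kernel implements is a per-channel normalization, out = (in - mean[c]) / std[c]. A plain C++ sketch of those semantics, assuming an NCHW-like collapsed [channels, elements-per-channel] layout (not the OpenCL kernel itself):

    #include <cstddef>

    // Reference semantics sketch: subtract the channel mean and divide by the
    // channel standard deviation for every element of that channel.
    void normalize_planar_yuv_ref(const float *in, float *out,
                                  const float *mean, const float *std_dev,
                                  std::size_t channels, std::size_t per_channel)
    {
        for (std::size_t c = 0; c < channels; ++c)
        {
            for (std::size_t i = 0; i < per_channel; ++i)
            {
                out[c * per_channel + i] = (in[c * per_channel + i] - mean[c]) / std_dev[c];
            }
        }
    }
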
diff --git a/src/core/CL/kernels/CLPadLayerKernel.cpp b/src/core/CL/kernels/CLPadLayerKernel.cpp
index 2f54b390d5..0ac285038e 100644
--- a/src/core/CL/kernels/CLPadLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPadLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,9 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -34,25 +36,29 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ PixelValue constant_value,
+ PaddingMode mode)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_UNUSED(constant_value);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON((padding.size() < 1) || (padding.size() > input->num_dimensions()));
- if(mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC)
+ if (mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC)
{
ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 3);
const auto is_reflect = static_cast<unsigned int>(mode == PaddingMode::REFLECT);
- for(size_t i = 0; i < padding.size(); ++i)
+ for (size_t i = 0; i < padding.size(); ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).first > (input->dimension(i) - is_reflect));
ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).second > (input->dimension(i) - is_reflect));
}
}
- if(output->total_size() > 0)
+ if (output->total_size() > 0)
{
TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
@@ -64,40 +70,51 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-CLPadLayerKernel::CLPadLayerKernel()
- : _input(nullptr), _output(nullptr), _4d_enabled(false)
+CLPadLayerKernel::CLPadLayerKernel() : _input(nullptr), _output(nullptr), _4d_enabled(false)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+void CLPadLayerKernel::configure(
+ const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode);
}
-void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+void CLPadLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PaddingList &padding,
+ PixelValue constant_value,
+ PaddingMode mode)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding)));
+ auto_init_if_empty(*output->info(),
+ input->info()->clone()->set_tensor_shape(
+ misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding)));
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), padding, constant_value, mode));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
_4d_enabled = (mode == PaddingMode::CONSTANT) && (padding.size() > 3);
// Set build options
- const DataType &data_type = input->info()->data_type();
- const unsigned int input_width = input->info()->dimension(0);
- const unsigned int input_height = input->info()->dimension(1);
- const unsigned int input_depth = input->info()->dimension(2);
- const unsigned int pad_x_before = padding.at(0).first;
- const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0;
- const unsigned int pad_z_before = padding.size() > 2 ? padding.at(2).first : 0;
- const unsigned int vec_size = adjust_vec_size(std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->info()->data_type()))), input_width);
- const unsigned int pad_right_start = input_width + pad_x_before;
- const unsigned int pad_x_before_remainder = pad_x_before % vec_size;
- const unsigned int vec_size_leftover_write = vec_size - (ceil_to_multiple(output->info()->dimension(0), vec_size) - output->info()->dimension(0));
+ const DataType &data_type = input->info()->data_type();
+ const unsigned int input_width = input->info()->dimension(0);
+ const unsigned int input_height = input->info()->dimension(1);
+ const unsigned int input_depth = input->info()->dimension(2);
+ const unsigned int pad_x_before = padding.at(0).first;
+ const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0;
+ const unsigned int pad_z_before = padding.size() > 2 ? padding.at(2).first : 0;
+ const unsigned int vec_size = adjust_vec_size(
+ std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->info()->data_type()))),
+ input_width);
+ const unsigned int pad_right_start = input_width + pad_x_before;
+ const unsigned int pad_x_before_remainder = pad_x_before % vec_size;
+ const unsigned int vec_size_leftover_write =
+ vec_size - (ceil_to_multiple(output->info()->dimension(0), vec_size) - output->info()->dimension(0));
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
@@ -106,12 +123,12 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const
build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width));
build_opts.add_option("-DPAD_X_BEFORE_REMAINDER=" + support::cpp11::to_string(pad_x_before_remainder));
build_opts.add_option("-DVEC_SIZE_LEFTOVER_WRITE=" + support::cpp11::to_string(vec_size_leftover_write));
- if(padding.size() > 1)
+ if (padding.size() > 1)
{
build_opts.add_option("-DPAD_Y_BEFORE=" + support::cpp11::to_string(pad_y_before));
build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height));
- if(padding.size() > 2)
+ if (padding.size() > 2)
{
build_opts.add_option("-DPAD_Z_BEFORE=" + support::cpp11::to_string(pad_z_before));
build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_depth));
@@ -119,23 +136,25 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const
}
std::string kernel_name = "pad_layer_";
- switch(mode)
+ switch (mode)
{
case PaddingMode::CONSTANT:
{
kernel_name += "constant";
- const unsigned int vec_size_leftover_read = vec_size - (ceil_to_multiple(pad_right_start, vec_size) - pad_right_start);
+ const unsigned int vec_size_leftover_read =
+ vec_size - (ceil_to_multiple(pad_right_start, vec_size) - pad_right_start);
build_opts.add_option("-DCONST_VAL=" + string_from_pixel_value(constant_value, data_type));
build_opts.add_option("-DVEC_SIZE_LEFTOVER_READ=" + support::cpp11::to_string(vec_size_leftover_read));
- if(pad_x_before >= vec_size)
+ if (pad_x_before >= vec_size)
{
build_opts.add_option("-DTHREADS_TO_SKIP_BEFORE=" + support::cpp11::to_string(pad_x_before / vec_size));
- build_opts.add_option("-DTHREADS_TO_SKIP_AFTER=" + support::cpp11::to_string(pad_right_start / vec_size));
+ build_opts.add_option("-DTHREADS_TO_SKIP_AFTER=" +
+ support::cpp11::to_string(pad_right_start / vec_size));
}
- if(_4d_enabled)
+ if (_4d_enabled)
{
build_opts.add_option("-DPAD_W_BEFORE=" + support::cpp11::to_string(padding.at(3).first));
build_opts.add_option("-DSRC_BATCH=" + support::cpp11::to_string(input->info()->dimension(3)));
@@ -152,14 +171,17 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const
const unsigned int pad_x_after_remainder = pad_right_start % vec_size;
const unsigned int after_pad_fact_x = (2 * input_width + pad_x_before) - is_reflect;
- const unsigned int output_last_x = ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size);
+ const unsigned int output_last_x = ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size);
build_opts.add_option("-DIS_REFLECT=" + support::cpp11::to_string(is_reflect));
build_opts.add_option("-DPAD_X_AFTER_REMAINDER=" + support::cpp11::to_string(pad_x_after_remainder));
- build_opts.add_option("-DPAD_X_BEFORE_REMAINDER_REFL=" + support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size));
- build_opts.add_option("-DPAD_X_AFTER_REMAINDER_REFL=" + support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size));
+ build_opts.add_option("-DPAD_X_BEFORE_REMAINDER_REFL=" +
+ support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size));
+ build_opts.add_option("-DPAD_X_AFTER_REMAINDER_REFL=" +
+ support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size));
build_opts.add_option("-DAFTER_PAD_FACT_X=" + support::cpp11::to_string(after_pad_fact_x));
- build_opts.add_option_if(after_pad_fact_x < output_last_x, "-DAFTER_PAD_REM=" + support::cpp11::to_string(after_pad_fact_x % vec_size));
+ build_opts.add_option_if(after_pad_fact_x < output_last_x,
+ "-DAFTER_PAD_REM=" + support::cpp11::to_string(after_pad_fact_x % vec_size));
break;
}
@@ -177,7 +199,11 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLPadLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+Status CLPadLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ PixelValue constant_value,
+ PaddingMode mode)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, constant_value, mode));
return Status{};
@@ -195,13 +221,12 @@ void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue)
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, slice);
add_3D_tensor_argument(idx, _output, slice);
- if(_4d_enabled)
+ if (_4d_enabled)
{
add_argument<unsigned int>(idx, batch++);
}
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLPadLayerKernel.h b/src/core/CL/kernels/CLPadLayerKernel.h
index 90af337f94..dca121b6a1 100644
--- a/src/core/CL/kernels/CLPadLayerKernel.h
+++ b/src/core/CL/kernels/CLPadLayerKernel.h
@@ -56,7 +56,11 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
- void configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
+ void configure(const ICLTensor *input,
+ ICLTensor *output,
+ const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
/** Set the input and output tensor.
*
* @param[in] compile_context The compile context to be used.
@@ -68,8 +72,12 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(),
- PaddingMode mode = PaddingMode::CONSTANT);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
/** Static function to check if given info will lead to a valid configuration of @ref CLPadLayerKernel
*
* @param[in] input Source tensor info. Data types supported: All.
@@ -80,7 +88,11 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
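
As a sanity check on the pad-kernel arithmetic above (vec_size, PAD_X_BEFORE_REMAINDER, VEC_SIZE_LEFTOVER_WRITE), the same expressions can be recomputed standalone; the numbers below are illustrative, and ceil_to_multiple_demo is a stand-in for the library's ceil_to_multiple:

    #include <algorithm>
    #include <cstdio>

    // Round v up to the next multiple of m (same rounding the diff relies on).
    unsigned int ceil_to_multiple_demo(unsigned int v, unsigned int m)
    {
        return ((v + m - 1) / m) * m;
    }

    int main()
    {
        // Illustrative F32 case: element size 4, input width 7, x padding (2, 3).
        const unsigned int element_size = 4;
        const unsigned int input_width  = 7;
        const unsigned int pad_x_before = 2;
        const unsigned int pad_x_after  = 3;

        unsigned int vec_size = std::min(16u, 32u / element_size); // 8
        while (vec_size > input_width)                              // simplified adjust_vec_size
        {
            vec_size /= 2;                                          // -> 4
        }

        const unsigned int output_width           = pad_x_before + input_width + pad_x_after; // 12
        const unsigned int pad_right_start        = input_width + pad_x_before;               // 9
        const unsigned int pad_x_before_remainder = pad_x_before % vec_size;                  // 2
        const unsigned int vec_size_leftover_write =
            vec_size - (ceil_to_multiple_demo(output_width, vec_size) - output_width);        // 4 - (12 - 12) = 4

        std::printf("vec=%u right_start=%u before_rem=%u leftover_write=%u\n",
                    vec_size, pad_right_start, pad_x_before_remainder, vec_size_leftover_write);
        return 0;
    }
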
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
deleted file mode 100644
index c68c526ec9..0000000000
--- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
+++ /dev/null
@@ -1,481 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_UNUSED(overflow_policy);
- ARM_COMPUTE_UNUSED(rounding_policy);
-
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1,
- 1,
- DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2,
- 1,
- DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(output->data_type()));
-
- const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
-
- // Validate in case of configured output
- if(output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output,
- 1,
- DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16, DataType::F16,
- DataType::S32, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
- "Output can only be U8 if both inputs are U8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QASYMM8 && (input1->data_type() != DataType::QASYMM8 || input2->data_type() != DataType::QASYMM8),
- "Output can only be QASYMM8 if both inputs are QASYMM8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QASYMM8_SIGNED && (input1->data_type() != DataType::QASYMM8_SIGNED || input2->data_type() != DataType::QASYMM8_SIGNED),
- "Output can only be QASYMM8_SIGNED if both inputs are QASYMM8_SIGNED");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QSYMM16 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16),
- "Output can only be QSYMM16 if both inputs are QSYMM16");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::S32 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16),
- "Output can only be S32 if both inputs are QSYMM16");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
-{
- const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
- const TensorShape &out_shape = broadcast_pair.first;
- const ValidRegion &valid_region = broadcast_pair.second;
-
- // Auto initialize output if not initialized
- {
- set_shape_if_empty(*output, out_shape);
-
- if(input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16)
- {
- set_format_if_unknown(*output, Format::S16);
- }
- else if(input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
- {
- set_format_if_unknown(*output, Format::F32);
- }
- else if(input1->data_type() == DataType::QASYMM8)
- {
- set_data_type_if_unknown(*output, DataType::QASYMM8);
- }
- else if(input1->data_type() == DataType::QASYMM8_SIGNED)
- {
- set_data_type_if_unknown(*output, DataType::QASYMM8_SIGNED);
- }
- else if(input1->data_type() == DataType::QSYMM16)
- {
- set_data_type_if_unknown(*output, DataType::QSYMM16);
- }
- }
-
- Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
- Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
- Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
-
- AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win_input1, input1_access)
- || update_window_and_padding(win_input2, input2_access)
- || update_window_and_padding(win, output_access);
-
- output_access.set_valid_region(win, valid_region);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLPixelWiseMultiplicationKernel::CLPixelWiseMultiplicationKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
-}
-
-void CLPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, output,
- scale, overflow_policy, rounding_policy, act_info));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input1, input2, output);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
- _input1 = input1;
- _input2 = input2;
- _output = output;
-
- int scale_int = -1;
- // Extract sign, exponent and mantissa
- int exponent = 0;
- float normalized_mantissa = std::frexp(scale, &exponent);
- // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
- // For scale = 1/2^n, frexp returns a mantissa of 0.5 and an exponent of 1 - n, so the exponent lies in the range -14 <= e <= 1
- // It is zero or negative for n >= 1; a standalone sketch of this test follows the function
- if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
- {
- // Store the positive exponent. We know that we compute 1/2^n
- // Additionally we subtract 1 to compensate for frexp using a mantissa of 0.5
- scale_int = std::abs(exponent - 1);
- }
-
- std::string acc_type;
- // Check whether either input is a float type
- if(is_data_type_float(input1->data_type()) || is_data_type_float(input2->data_type()))
- {
- scale_int = -1;
- acc_type = (input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32) ? "float" : "half";
- }
- else
- {
- if(input1->element_size() == 2 || input2->element_size() == 2)
- {
- // Use 32-bit accumulator for 16-bit input
- acc_type = "int";
- }
- else
- {
- // Use 16-bit accumulator for 8-bit input
- acc_type = "ushort";
- }
- }
-
- const bool is_quantized = is_data_type_quantized(input1->data_type());
-
- // Set kernel build options
- std::string kernel_name = "pixelwise_mul";
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->data_type()));
- build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->data_type()));
- build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->data_type()));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- if(is_quantized && (output->data_type() != DataType::S32))
- {
- const UniformQuantizationInfo iq1_info = input1->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = input2->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
-
- build_opts.add_option_if(is_data_type_quantized_asymmetric(input1->data_type()),
- "-DOFFSET_IN1=" + support::cpp11::to_string(iq1_info.offset));
- build_opts.add_option_if(is_data_type_quantized_asymmetric(input2->data_type()),
- "-DOFFSET_IN2=" + support::cpp11::to_string(iq2_info.offset));
- build_opts.add_option_if(is_data_type_quantized_asymmetric(output->data_type()),
- "-DOFFSET_OUT=" + support::cpp11::to_string(oq_info.offset));
- build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale));
- build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale));
- build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
- kernel_name += "_quantized";
- }
- else
- {
- kernel_name += (scale_int >= 0) ? "_int" : "_float";
- build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->data_type()), "-DWRAP", "-DSATURATE");
- build_opts.add_option_if_else(rounding_policy == RoundingPolicy::TO_ZERO, "-DROUND=_rtz", "-DROUND=_rte");
- build_opts.add_option("-DACC_DATA_TYPE=" + acc_type);
- if(act_info.enabled())
- {
- build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
- build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
- build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
- }
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set scale argument
- unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the inputs and output parameters
-
- if(scale_int >= 0 && !is_quantized)
- {
- _kernel.setArg(idx++, scale_int);
- }
- else
- {
- _kernel.setArg(idx++, scale);
- }
-
- ICLKernel::configure_internal(win_config.second);
-}
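
A minimal standalone sketch of the frexp-based test above (hypothetical helper name, plain C++, no Compute Library dependencies):

#include <cmath>
#include <cstdio>
#include <cstdlib>

// Returns n if scale == 1/2^n for 0 <= n <= 15 (the "_int" kernel path),
// otherwise -1 (the "_float" kernel path).
int extract_scale_exponent(float scale)
{
    int   exponent = 0;
    float mantissa = std::frexp(scale, &exponent);
    // For scale = 1/2^n, frexp yields a mantissa of 0.5 and exponent 1 - n,
    // so n in [0, 15] maps to an exponent in [-14, 1].
    if(mantissa == 0.5f && -14 <= exponent && exponent <= 1)
    {
        return std::abs(exponent - 1); // recover n from e = 1 - n
    }
    return -1;
}

int main()
{
    std::printf("%d\n", extract_scale_exponent(0.25f)); // 2
    std::printf("%d\n", extract_scale_exponent(1.0f));  // 0
    std::printf("%d\n", extract_scale_exponent(0.1f));  // -1
}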
-
-Status CLPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy, act_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
-
- return Status{};
-}
-
-void CLPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- const TensorShape &in_shape1 = src_0->info()->tensor_shape();
- const TensorShape &in_shape2 = src_1->info()->tensor_shape();
- const TensorShape &out_shape = dst->info()->tensor_shape();
-
- bool can_collapse = true;
- if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
- {
- can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
- {
- can_collapse = (in_shape1[d] == in_shape2[d]);
- }
- }
-
- bool has_collapsed = false;
- Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
-
- const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
- const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
- Window slice = collapsed.first_slice_window_3D();
- Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
- Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, src_0, slice_input1);
- add_3D_tensor_argument(idx, src_1, slice_input2);
- add_3D_tensor_argument(idx, dst, slice);
- enqueue(queue, *this, slice, lws_hint());
-
- ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
- ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
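
run_op above only collapses dimensions from Z upwards when the two inputs agree on every dimension at or beyond Z, since a broadcast input must keep a per-slice window. A simplified sketch of that check on plain shape vectors (it omits the single-element shortcut the kernel also allows):

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

// Mirrors the can_collapse test: collapsing from dimension Z (index 2) is only
// safe when neither input broadcasts in any dimension >= Z.
bool can_collapse_from_z(const std::vector<std::size_t> &in1,
                         const std::vector<std::size_t> &in2,
                         const std::vector<std::size_t> &out)
{
    const std::size_t dim_z = 2;
    if(std::min(in1.size(), in2.size()) <= dim_z)
    {
        return false;
    }
    for(std::size_t d = dim_z; d < out.size(); ++d)
    {
        if(in1[d] != in2[d])
        {
            return false;
        }
    }
    return true;
}

int main()
{
    std::printf("%d\n", can_collapse_from_z({8, 8, 4, 2}, {8, 8, 4, 2}, {8, 8, 4, 2})); // 1
    std::printf("%d\n", can_collapse_from_z({8, 8, 4, 2}, {8, 8, 1, 2}, {8, 8, 4, 2})); // 0
}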
-
-BorderSize CLPixelWiseMultiplicationKernel::border_size() const
-{
- const unsigned int replicateSize = _output->dimension(0) - std::min(_input1->dimension(0), _input2->dimension(0));
- const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
- return BorderSize{ 0, border, 0, 0 };
-}
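
border_size above only requests right-border padding when one input is broadcast along x. A worked example (the 16-element step is an assumption for illustration):

#include <algorithm>
#include <cstdio>

int main()
{
    const unsigned int step  = 16; // assumed num_elems_processed_per_iteration
    const unsigned int out_w = 8, in1_w = 8, in2_w = 1; // in2 broadcasts along x

    const unsigned int replicate = out_w - std::min(in1_w, in2_w); // 8 - 1 = 7
    const unsigned int border    = std::min(step - 1U, replicate); // min(15, 7) = 7
    std::printf("right border = %u\n", border); // BorderSize{0, 7, 0, 0}
}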
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration_complex = 1;
-
-Status validate_arguments_complex(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 2, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 2, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
-
- const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(output->data_type()));
-
- // Validate in case of configured output
- if(output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 2, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window_complex(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
-{
- const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
- const TensorShape &out_shape = broadcast_pair.first;
- const ValidRegion &valid_region = broadcast_pair.second;
-
- // Auto initialize output if not initialized
- const TensorInfo out_info(out_shape, input1->num_channels(), input1->data_type());
- auto_init_if_empty(*output, out_info);
-
- Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration_complex));
- Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
- Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
-
- AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration_complex);
- AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration_complex);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_complex);
-
- bool window_changed = update_window_and_padding(win_input1, input1_access)
- || update_window_and_padding(win_input2, input2_access)
- || update_window_and_padding(win, output_access);
-
- output_access.set_valid_region(win, valid_region);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLComplexPixelWiseMultiplicationKernel::CLComplexPixelWiseMultiplicationKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLComplexPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
-}
-
-void CLComplexPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1, input2, output, act_info));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window_complex(input1, input2, output);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
- _input1 = input1;
- _input2 = input2;
- _output = output;
-
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_output->data_type()));
- if(act_info.enabled())
- {
- build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
- build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
- build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, "pixelwise_mul_complex", build_opts.options());
-
- ICLKernel::configure_internal(win_config.second);
-}
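
The OpenCL source of pixelwise_mul_complex is not part of this diff, but the 2-channel F16/F32 layout holds (real, imaginary) pairs, so each element should follow the standard complex product; a host-side reference:

#include <complex>
#include <cstdio>

int main()
{
    // (1 + 2i) * (3 - i) = (1*3 - 2*(-1)) + (1*(-1) + 2*3)i = 5 + 5i
    const std::complex<float> a{1.f, 2.f};
    const std::complex<float> b{3.f, -1.f};
    const std::complex<float> p = a * b;
    std::printf("(%g, %g)\n", p.real(), p.imag()); // (5, 5)
}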
-
-Status CLComplexPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(input1, input2, output, act_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_complex(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
-
- return Status{};
-}
-
-void CLComplexPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- const TensorShape &in_shape1 = src_0->info()->tensor_shape();
- const TensorShape &in_shape2 = src_1->info()->tensor_shape();
- const TensorShape &out_shape = dst->info()->tensor_shape();
-
- bool can_collapse = true;
- if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
- {
- can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
- {
- can_collapse = (in_shape1[d] == in_shape2[d]);
- }
- }
-
- bool has_collapsed = false;
- Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
-
- const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
- const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
- Window slice = collapsed.first_slice_window_3D();
- Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
- Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, src_0, slice_input1);
- add_3D_tensor_argument(idx, src_1, slice_input2);
- add_3D_tensor_argument(idx, dst, slice);
- enqueue(queue, *this, slice, lws_hint());
-
- ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
- ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLComplexPixelWiseMultiplicationKernel::border_size() const
-{
- const unsigned int replicateSize = _output->dimension(0) - std::min(_input1->dimension(0), _input2->dimension(0));
- const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration_complex - 1U, replicateSize);
- return BorderSize{ 0, border, 0, 0 };
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.h b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
deleted file mode 100644
index 74102fd397..0000000000
--- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H
-#define ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** Interface for the pixelwise multiplication kernel. */
-class CLPixelWiseMultiplicationKernel : public ICLKernel
-{
-public:
- /** Default constructor.*/
- CLPixelWiseMultiplicationKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLPixelWiseMultiplicationKernel(const CLPixelWiseMultiplicationKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLPixelWiseMultiplicationKernel &operator=(const CLPixelWiseMultiplicationKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLPixelWiseMultiplicationKernel(CLPixelWiseMultiplicationKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLPixelWiseMultiplicationKernel &operator=(CLPixelWiseMultiplicationKernel &&) = default;
- /** Initialise the kernel's input, output and border mode.
- *
- * Valid configurations (Input1,Input2) -> Output :
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,U8) -> S16
- * - (S16,S16) -> S16
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- * - (QASYMM8,QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (QSYMM16,QSYMM16) -> QSYMM16
- * - (QSYMM16,QSYMM16) -> S32
- *
- * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * @param[in] input2 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * @param[out] output The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * @param[in] scale Scale to apply after multiplication.
- * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
- * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Initialise the kernel's input, output and border mode.
- *
- * Valid configurations (Input1,Input2) -> Output :
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,U8) -> S16
- * - (S16,S16) -> S16
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- * - (QASYMM8,QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (QSYMM16,QSYMM16) -> QSYMM16
- * - (QSYMM16,QSYMM16) -> S32
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * @param[in] input2 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * @param[out] output The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * @param[in] scale Scale to apply after multiplication.
- * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
- * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplicationKernel
- *
- * Valid configurations (Input1,Input2) -> Output :
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,U8) -> S16
- * - (S16,S16) -> S16
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- * - (QASYMM8,QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (QSYMM16,QSYMM16) -> QSYMM16
- * - (QSYMM16,QSYMM16) -> S32
- *
- * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * @param[in] input2 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * @param[in] output The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * @param[in] scale Scale to apply after multiplication.
- * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
- * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- const ITensorInfo *_input1;
- const ITensorInfo *_input2;
- ITensorInfo *_output;
-};
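
A validation-only sketch of the interface above as it stood before this removal (include paths and the 1/255 scale are taken from the declarations and doc comments in this header; treat it as illustrative rather than the current API):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"

using namespace arm_compute;

int main()
{
    // (F32, F32) -> F32 with scale 1/255, one of the documented configurations.
    TensorInfo in1(TensorShape(16U, 16U), 1, DataType::F32);
    TensorInfo in2(TensorShape(16U, 16U), 1, DataType::F32);
    TensorInfo out(TensorShape(16U, 16U), 1, DataType::F32);

    const Status s = CLPixelWiseMultiplicationKernel::validate(
        &in1, &in2, &out, 1.f / 255.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
    return (s.error_code() == ErrorCode::OK) ? 0 : 1;
}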
-
-/** Interface for the complex pixelwise multiplication kernel. */
-class CLComplexPixelWiseMultiplicationKernel : public ICLKernel
-{
-public:
- /** Default constructor.*/
- CLComplexPixelWiseMultiplicationKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLComplexPixelWiseMultiplicationKernel(const CLComplexPixelWiseMultiplicationKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLComplexPixelWiseMultiplicationKernel &operator=(const CLComplexPixelWiseMultiplicationKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLComplexPixelWiseMultiplicationKernel(CLComplexPixelWiseMultiplicationKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLComplexPixelWiseMultiplicationKernel &operator=(CLComplexPixelWiseMultiplicationKernel &&) = default;
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] input1 An input tensor info. Data types supported: F16/F32. Number of channels supported: 2.
- * @param[in] input2 An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
- * @param[out] output The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input1 An input tensor info. Data types supported: F32. Number of channels supported: 2.
- * @param[in] input2 An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
- * @param[out] output The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref CLComplexPixelWiseMultiplicationKernel
- *
- * @param[in] input1 An input tensor info. Data types supported: F32. Number of channels supported: 2.
- * @param[in] input2 An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
- * @param[in] output The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- const ITensorInfo *_input1;
- const ITensorInfo *_input2;
- ITensorInfo *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H */
diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
index 7b9caf0063..7dcdf1de6f 100644
--- a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,10 +30,10 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
using namespace arm_compute::misc::shape_calculator;
@@ -42,7 +42,10 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status validate_arguments(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
@@ -51,10 +54,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
// Check variances
const int var_size = info.variances().size();
- if(var_size > 1)
+ if (var_size > 1)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size != 4, "Must provide 4 variance values");
- for(int i = 0; i < var_size; ++i)
+ for (int i = 0; i < var_size; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size <= 0, "Must be greater than 0");
}
@@ -62,17 +65,19 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[0] < 0.f, "Step x should be greater or equal to 0");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[1] < 0.f, "Step y should be greater or equal to 0");
- if(!info.max_sizes().empty())
+ if (!info.max_sizes().empty())
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), "Max and min sizes dimensions should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(),
+ "Max and min sizes dimensions should match");
}
- for(unsigned int i = 0; i < info.max_sizes().size(); ++i)
+ for (unsigned int i = 0; i < info.max_sizes().size(); ++i)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], "Max size should be greater than min size");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i],
+ "Max size should be greater than min size");
}
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2);
}
@@ -80,7 +85,11 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, const PriorBoxLayerInfo &info, int num_priors)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ ITensorInfo *output,
+ const PriorBoxLayerInfo &info,
+ int num_priors)
{
ARM_COMPUTE_UNUSED(input2);
// Output tensor auto initialization if not yet initialized
@@ -88,10 +97,11 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
auto_init_if_empty(*output, output_shape, 1, input1->data_type());
const unsigned int num_elems_processed_per_iteration = 4 * num_priors;
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
bool window_changed = update_window_and_padding(win, output_access);
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
@@ -99,15 +109,28 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
CLPriorBoxLayerKernel::CLPriorBoxLayerKernel()
: _input1(nullptr), _input2(nullptr), _output(nullptr), _info(), _num_priors(), _min(), _max(), _aspect_ratios()
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLPriorBoxLayerKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios)
+void CLPriorBoxLayerKernel::configure(const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info,
+ cl::Buffer *min,
+ cl::Buffer *max,
+ cl::Buffer *aspect_ratios)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info, min, max, aspect_ratios);
}
-void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min,
- cl::Buffer *max, cl::Buffer *aspect_ratios)
+void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info,
+ cl::Buffer *min,
+ cl::Buffer *max,
+ cl::Buffer *aspect_ratios)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
@@ -134,7 +157,7 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c
int img_width = info.img_size().x;
int img_height = info.img_size().y;
- if(img_width == 0 || img_height == 0)
+ if (img_width == 0 || img_height == 0)
{
img_width = input2->info()->dimension(width_idx);
img_height = input2->info()->dimension(height_idx);
@@ -142,7 +165,7 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c
float step_x = info.steps()[0];
float step_y = info.steps()[0];
- if(step_x == 0.f || step_y == 0.f)
+ if (step_x == 0.f || step_y == 0.f)
{
step_x = static_cast<float>(img_width) / layer_width;
step_y = static_cast<float>(img_height) / layer_height;
@@ -161,18 +184,20 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c
build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(info.offset()));
build_opts.add_option_if(info.clip(), "-DIN_PLACE");
- if(info.variances().size() > 1)
+ if (info.variances().size() > 1)
{
- for(unsigned int i = 0; i < info.variances().size(); ++i)
+ for (unsigned int i = 0; i < info.variances().size(); ++i)
{
- build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(i)));
+ build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" +
+ support::cpp11::to_string(info.variances().at(i)));
}
}
else
{
- for(unsigned int i = 0; i < 4; ++i)
+ for (unsigned int i = 0; i < 4; ++i)
{
- build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(0)));
+ build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" +
+ support::cpp11::to_string(info.variances().at(0)));
}
}
@@ -193,13 +218,17 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c
ICLKernel::configure_internal(win_config.second);
}
-Status CLPriorBoxLayerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status CLPriorBoxLayerKernel::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, info));
const int num_priors = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get(), info, num_priors)
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(),
+ output->clone().get(), info, num_priors)
+ .first);
return Status{};
}
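
For reference, each prior contributes four values (x, y, w, h), which is why the window step earlier in this file is 4 * num_priors. A worked example with assumed SSD-style parameters:

#include <cstdio>
#include <vector>

int main()
{
    // Example layer parameters (assumed, not taken from this diff).
    const std::vector<float> min_sizes{30.f, 60.f};
    const std::vector<float> max_sizes{60.f, 90.f};
    const std::vector<float> aspect_ratios{1.f, 2.f, 0.5f};

    // Mirrors validate() above: aspect_ratios * min_sizes + max_sizes.
    const int num_priors =
        static_cast<int>(aspect_ratios.size() * min_sizes.size() + max_sizes.size()); // 3*2 + 2 = 8
    std::printf("priors: %d, elems per iteration: %d\n", num_priors, 4 * num_priors); // 8, 32
}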
@@ -210,8 +239,9 @@ void CLPriorBoxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
queue.enqueueWriteBuffer(*_min, CL_TRUE, 0, _info.min_sizes().size() * sizeof(float), _info.min_sizes().data());
- queue.enqueueWriteBuffer(*_aspect_ratios, CL_TRUE, 0, _info.aspect_ratios().size() * sizeof(float), _info.aspect_ratios().data());
- if(!_info.max_sizes().empty())
+ queue.enqueueWriteBuffer(*_aspect_ratios, CL_TRUE, 0, _info.aspect_ratios().size() * sizeof(float),
+ _info.aspect_ratios().data());
+ if (!_info.max_sizes().empty())
{
queue.enqueueWriteBuffer(*_max, CL_TRUE, 0, _info.max_sizes().size() * sizeof(float), _info.max_sizes().data());
}
diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.h b/src/core/CL/kernels/CLPriorBoxLayerKernel.h
index 6c369a7a4e..a50e0c5ff5 100644
--- a/src/core/CL/kernels/CLPriorBoxLayerKernel.h
+++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.h
@@ -57,7 +57,13 @@ public:
* @param[in] max Maximum prior box values
* @param[in] aspect_ratios Aspect ratio values
*/
- void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios);
+ void configure(const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info,
+ cl::Buffer *min,
+ cl::Buffer *max,
+ cl::Buffer *aspect_ratios);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -69,8 +75,14 @@ public:
* @param[in] max Maximum prior box values
* @param[in] aspect_ratios Aspect ratio values
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max,
- cl::Buffer *aspect_ratios);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info,
+ cl::Buffer *min,
+ cl::Buffer *max,
+ cl::Buffer *aspect_ratios);
/** Static function to check if given info will lead to a valid configuration of @ref CLPriorBoxLayerKernel
*
* @param[in] input1 First source tensor info. Data types supported: F32. Data layouts supported: NCHW/NHWC.
@@ -80,14 +92,17 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
private:
- const ICLTensor *_input1;
- const ICLTensor *_input2;
+ const ICLTensor *_input1;
+ const ICLTensor *_input2;
ICLTensor *_output;
PriorBoxLayerInfo _info;
int _num_priors;
diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
index ccc61783c4..731fcb8e04 100644
--- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,8 +22,12 @@
* SOFTWARE.
*/
#include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -47,15 +51,19 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
const uint32_t temp_num_elems_processed_per_iteration = max_cl_vector_width / input->element_size();
/* If the width is less than the step, clamp the step to the width so the global size is the actual width rather than the step. */
/* Alternatively, this could be fixed in arm_compute::enqueue() or arm_compute::calculate_max_window(). */
- const uint32_t num_elems_processed_per_iteration = (input->dimension(0) < temp_num_elems_processed_per_iteration) ? input->dimension(0) : temp_num_elems_processed_per_iteration;
+ const uint32_t num_elems_processed_per_iteration = (input->dimension(0) < temp_num_elems_processed_per_iteration)
+ ? input->dimension(0)
+ : temp_num_elems_processed_per_iteration;
// This kernel doesn't need padding
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
return std::make_pair(Status{}, win);
}
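
The clamp above keeps the global size from exceeding the row width. A worked example (the 16-byte vector width is an assumption; QSYMM16 elements are 2 bytes):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main()
{
    const uint32_t max_cl_vector_width = 16; // bytes, assumed device value
    const uint32_t element_size        = 2;  // QSYMM16
    const uint32_t width               = 5;  // tensor dimension 0

    const uint32_t step    = max_cl_vector_width / element_size; // 8
    const uint32_t clamped = std::min(width, step);              // 5
    std::printf("step = %u\n", clamped);
}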
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *weight,
+ const ITensorInfo *bias)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weight, bias, output);
@@ -71,7 +79,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(weight, bias);
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -83,12 +91,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
CLQLSTMLayerNormalizationKernel::CLQLSTMLayerNormalizationKernel()
: _input(nullptr), _weight(nullptr), _bias(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias)
+void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *weight,
+ const ICLTensor *bias)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, bias, output);
- auto padding_info = get_padding_info({ input, weight, bias, output });
+ auto padding_info = get_padding_info({input, weight, bias, output});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), weight->info(), bias->info()));
@@ -102,7 +115,8 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_
int32_t output_multiplier{};
int32_t output_shift{};
const UniformQuantizationInfo quan_info = _weight->info()->quantization_info().uniform();
- const Status status = quantization::calculate_quantized_multiplier(quan_info.scale, &output_multiplier, &output_shift);
+ const Status status =
+ quantization::calculate_quantized_multiplier(quan_info.scale, &output_multiplier, &output_shift);
output_shift *= -1;
// Set build options
@@ -112,8 +126,12 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_
build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
- build_opts.add_option("-DMIN_BOUND=" + support::cpp11::to_string(std::get<0>(quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
- build_opts.add_option("-DMAX_BOUND=" + support::cpp11::to_string(std::get<1>(quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
+ build_opts.add_option("-DMIN_BOUND=" +
+ support::cpp11::to_string(std::get<0>(
+ quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
+ build_opts.add_option("-DMAX_BOUND=" +
+ support::cpp11::to_string(std::get<1>(
+ quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
// Create kernel
_kernel = create_kernel(compile_context, "qlstm_layer_normalization", build_opts.options());
@@ -133,12 +151,18 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-void CLQLSTMLayerNormalizationKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias)
+void CLQLSTMLayerNormalizationKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *weight,
+ const ICLTensor *bias)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, weight, bias);
}
-Status CLQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias)
+Status CLQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *weight,
+ const ITensorInfo *bias)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, weight, bias));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
@@ -169,7 +193,6 @@ void CLQLSTMLayerNormalizationKernel::run(const Window &window, cl::CommandQueue
add_2D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
+ } while (window.slide_window_slice_2D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
index 31085c37ba..ba912e1d2d 100644
--- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
+++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
@@ -63,7 +63,11 @@ public:
* @param[in] weight Weight tensor. Data types supported: Same as @p input.
* @param[in] bias Bias tensor. Data types supported: S32.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *weight,
+ const ICLTensor *bias);
/** Static function to check if given info will lead to a valid configuration of @ref CLQLSTMLayerNormalizationKernel
*
* @param[in] input Source tensor info with 2 dimensions. Data types supported: QSYMM16.
@@ -73,7 +77,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
deleted file mode 100644
index 76e703f0dd..0000000000
--- a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLQuantizationLayerKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-
- // Output must always be initialized
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-
- return Status{};
-}
-} // namespace
-
-CLQuantizationLayerKernel::CLQuantizationLayerKernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLQuantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLQuantizationLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- auto padding_info = get_padding_info({ input, output });
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
- _input = input;
- _output = output;
-
- const int vec_size_x = 16 / input->info()->element_size();
- const int input_width_x = input->info()->tensor_shape().x();
- const bool multi_access_x = (input_width_x / vec_size_x > 0);
-
- const UniformQuantizationInfo qinfo = output->info()->quantization_info().uniform();
- const DataType output_data_type = output->info()->data_type();
-
- float scale_to_apply = qinfo.scale;
- int32_t offset_to_apply = qinfo.offset;
- if(is_data_type_quantized_asymmetric(_input->info()->data_type()))
- {
- /*
- * When requantizing a quantized input tensor to an output tensor with a different quantization,
- * instead of applying a dequantization followed by a quantization, we just compute the new scale
- * and offset to apply.
- *
- * Assuming:
- * - q_i as input quantized value
- * - q_o as output quantized value
- * - z_i as input quantization offset value
- * - z_o as output quantization offset value
- * - s_i as input quantization scale value
- * - s_o as output quantization scale value
- * - z_n as new quantization offset value
- * - s_n as new quantization scale value
- *
- * q_o = ( q_i - z_i ) * s_i / s_o + z_o
- *
- * We can rewrite the formula as:
- *
- * q_o = ( q_i * s_i / s_o ) - z_i * s_i / s_o + z_o
- *
- * q_o = q_i / s_n + z_n
- *
- * Where:
- *
- * s_n = s_o / s_i
- *
- * z_n = - z_i * s_i / s_o + z_o
- *
- * (a standalone numeric check of this identity follows the end of this function)
- */
- const UniformQuantizationInfo qinfo_in = _input->info()->quantization_info().uniform();
- scale_to_apply /= qinfo_in.scale;
- // In order to minimize flooring we convert the offset to a float,
- // then compute the new offset in the float domain,
- // finally we convert it back to int32_t
- offset_to_apply -= static_cast<int32_t>(static_cast<float>(qinfo_in.offset) * qinfo_in.scale / qinfo.scale);
- }
-
- // Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option_if(is_data_type_float(_input->info()->data_type()), "-DIS_FLOAT");
- build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_to_apply));
- build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_to_apply));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
- build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output_data_type));
- build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0)));
- std::pair<int, int> min_max_quant_values = quantization::get_min_max_values_from_quantized_data_type(output_data_type);
- build_opts.add_option("-DMIN_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.first));
- build_opts.add_option("-DMAX_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.second));
-
- _kernel = create_kernel(compile_context, "quantization_layer", build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
- if(multi_access_x)
- {
- win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
- }
- ICLKernel::configure_internal(win);
-
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
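
A standalone numeric check of the folded requantization identity documented in configure above (the example scales and offsets are assumed):

#include <cmath>
#include <cstdio>

int main()
{
    const float s_i = 0.5f, s_o = 0.25f; // assumed input/output scales
    const int   z_i = 10,   z_o = 3;     // assumed input/output offsets

    const float s_n = s_o / s_i;              // new scale
    const float z_n = -z_i * s_i / s_o + z_o; // new offset

    for(int q_i = 0; q_i < 256; ++q_i)
    {
        const float direct = (q_i - z_i) * s_i / s_o + z_o; // dequantize then quantize
        const float folded = q_i / s_n + z_n;               // single fused step
        if(std::fabs(direct - folded) > 1e-4f)
        {
            std::printf("mismatch at q_i = %d\n", q_i);
            return 1;
        }
    }
    std::printf("identity holds\n");
    return 0;
}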
-
-Status CLQuantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
- return Status{};
-}
-
-void CLQuantizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3);
- Window slice = window_collapsed.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLQuantizationLayerKernel.h b/src/core/CL/kernels/CLQuantizationLayerKernel.h
deleted file mode 100644
index e9d03decb3..0000000000
--- a/src/core/CL/kernels/CLQuantizationLayerKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLQUANTIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_CLQUANTIZATIONLAYERKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the quantization layer kernel.
- *
- * @note The implementation supports only 3D input tensors.
- */
-class CLQuantizationLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLQuantizationLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLQuantizationLayerKernel(const CLQuantizationLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLQuantizationLayerKernel &operator=(const CLQuantizationLayerKernel &) = delete;
- /** Default Move Constructor. */
- CLQuantizationLayerKernel(CLQuantizationLayerKernel &&) = default;
- /** Default move assignment operator */
- CLQuantizationLayerKernel &operator=(CLQuantizationLayerKernel &&) = default;
- /** Default destructor */
- ~CLQuantizationLayerKernel() = default;
- /** Set the input, output.
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
- * @param[out] output Destination tensor with the same dimensions as the input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
- *
- * @note Output auto initialization is not supported by this kernel
- */
- void configure(const ICLTensor *input, ICLTensor *output);
- /** Set the input, output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
- * @param[out] output Destination tensor with the same dimensions as the input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
- *
- * @note Output auto initialization is not supported by this kernel
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLQuantizationLayerKernel
- *
- * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
- * @param[in] output Destination tensor info with the same dimensions as the input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLQUANTIZATIONLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
index 87f4a5d7f3..c97910ef79 100644
--- a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,14 +25,13 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLArray.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -44,24 +43,29 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5);
ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F32, DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC, DataLayout::NCHW);
ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info),
+ output->tensor_shape());
}
- if(is_data_type_quantized_asymmetric(input->data_type()))
+ if (is_data_type_quantized_asymmetric(input->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::QASYMM16);
@@ -81,14 +85,22 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITe
CLROIAlignLayerKernel::CLROIAlignLayerKernel()
: _input(nullptr), _output(nullptr), _rois(nullptr), _pool_info(0, 0, 0.f)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLROIAlignLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayerKernel::configure(const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
}
-void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info));
@@ -98,7 +110,7 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
output->info()->set_data_layout(input->info()->data_layout());
- auto padding_info = get_padding_info({ input, rois, output });
+ auto padding_info = get_padding_info({input, rois, output});
_input = input;
_output = output;
@@ -112,16 +124,23 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
build_opts.add_option("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH))));
- build_opts.add_option("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT))));
- build_opts.add_option("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL))));
+ build_opts.add_option("-DMAX_DIM_X=" +
+ support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(
+ input->info()->data_layout(), DataLayoutDimension::WIDTH))));
+ build_opts.add_option("-DMAX_DIM_Y=" +
+ support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(
+ input->info()->data_layout(), DataLayoutDimension::HEIGHT))));
+ build_opts.add_option("-DMAX_DIM_Z=" +
+ support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(
+ input->info()->data_layout(), DataLayoutDimension::CHANNEL))));
build_opts.add_option("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width()));
build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height()));
build_opts.add_option("-DSPATIAL_SCALE=" + float_to_string_with_full_precision(pool_info.spatial_scale()));
build_opts.add_option_if(input->info()->data_layout() == DataLayout::NHWC, "-DNHWC");
- build_opts.add_option_if(pool_info.sampling_ratio() > 0, "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio()));
+ build_opts.add_option_if(pool_info.sampling_ratio() > 0,
+ "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio()));
- if(is_qasymm)
+ if (is_qasymm)
{
const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
const UniformQuantizationInfo roisq_info = rois->info()->quantization_info().uniform();
@@ -145,7 +164,10 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIAlignLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info));
return Status{};
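The configure hunks above funnel every kernel parameter through CLBuildOptions as -D defines, which the OpenCL compiler then exposes as preprocessor macros inside the .cl source. A standalone mock of that accumulate-and-hand-over pattern (the real class is arm_compute::CLBuildOptions with add_option()/add_option_if()/options(); the option values here are illustrative):

#include <set>
#include <string>

std::set<std::string> build_example_options(bool is_nhwc, int sampling_ratio)
{
    std::set<std::string> opts;
    opts.insert("-DDATA_TYPE=float");                      // unconditional define
    if (is_nhwc)
        opts.insert("-DNHWC");                             // define only when the predicate holds
    if (sampling_ratio > 0)
        opts.insert("-DSAMPLING_RATIO=" + std::to_string(sampling_ratio));
    return opts; // what create_kernel() ultimately receives as build flags
}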
diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.h b/src/core/CL/kernels/CLROIAlignLayerKernel.h
index cbf0e00165..2e84e5d303 100644
--- a/src/core/CL/kernels/CLROIAlignLayerKernel.h
+++ b/src/core/CL/kernels/CLROIAlignLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,7 @@
#ifndef ARM_COMPUTE_CLROIALIGNLAYERKERNEL_H
#define ARM_COMPUTE_CLROIALIGNLAYERKERNEL_H
-#include "arm_compute/core/CL/ICLArray.h"
#include "src/core/CL/ICLKernel.h"
-
namespace arm_compute
{
class ICLTensor;
@@ -63,7 +61,8 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void
+ configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -79,7 +78,11 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLROIAlignLayerKernel
*
* @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -95,7 +98,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue);
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
index 43492a3d50..1b2c414a49 100644
--- a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,82 +25,82 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLArray.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "src/core/AccessWindowStatic.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
+#include <cfloat>
#include <cmath>
-#include <set>
#include <string>
namespace arm_compute
{
-namespace
-{
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+CLROIPoolingLayerKernel::CLROIPoolingLayerKernel()
+ : _input(nullptr), _rois(nullptr), _output(nullptr), _pool_info(0, 0, 0.f)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output auto initialization if not yet initialized
- TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->dimension(2), rois->dimension(1));
- auto_init_if_empty((*output), output_shape, 1, input->data_type());
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 1;
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input_access(input, input->valid_region().start(0), num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
+ _type = CLKernelType::ELEMENTWISE;
}
-} // namespace
-CLROIPoolingLayerKernel::CLROIPoolingLayerKernel()
- : _input(nullptr), _rois(nullptr), _output(nullptr), _pool_info(0, 0, 0.f)
+Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
+    // Validate arguments
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::U16);
+ ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5);
+ ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
+
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) ||
+ (output->dimension(1) != pool_info.pooled_height()));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2));
+ ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3));
+ }
+
+ return Status{};
}
-void CLROIPoolingLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayerKernel::configure(const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
}
-void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ const ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, rois, output);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ CLROIPoolingLayerKernel::validate(input->info(), rois->info(), output->info(), pool_info));
- //Validate arguments
- ARM_COMPUTE_ERROR_ON_NULLPTR(input->info(), rois->info(), output->info());
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::U16);
- ARM_COMPUTE_ERROR_ON(rois->info()->dimension(0) != 5);
- ARM_COMPUTE_ERROR_ON(rois->info()->num_dimensions() > 2);
- ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
-
- if(output->info()->total_size() != 0)
- {
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
- ARM_COMPUTE_ERROR_ON(rois->info()->dimension(1) != output->info()->dimension(3));
- }
+ auto padding_info = get_padding_info({input, rois, output});
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), rois->info(), output->info(), pool_info);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ // Output auto initialization if not yet initialized
+ TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2),
+ rois->info()->dimension(1));
+ auto_init_if_empty(*(output->info()), output_shape, 1, input->info()->data_type(),
+ output->info()->quantization_info());
// Set instance variables
_input = input;
@@ -108,27 +108,46 @@ void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context,
_output = output;
_pool_info = pool_info;
+ const DataType data_type = input->info()->data_type();
+ const bool is_qasymm = is_data_type_quantized_asymmetric(data_type);
+
// Set build options
- std::set<std::string> build_opts;
- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- build_opts.emplace(("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type())));
- build_opts.emplace(("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(Window::DimX))));
- build_opts.emplace(("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(Window::DimY))));
- build_opts.emplace(("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(Window::DimZ))));
- build_opts.emplace(("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width())));
- build_opts.emplace(("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height())));
- build_opts.emplace(("-DSPATIAL_SCALE=" + support::cpp11::to_string(pool_info.spatial_scale())));
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DDATA_SIZE=" + get_data_size_from_data_type(data_type));
+ build_opts.add_option("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(Window::DimX)));
+ build_opts.add_option("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(Window::DimY)));
+ build_opts.add_option("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(Window::DimZ)));
+ build_opts.add_option("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width()));
+ build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height()));
+ build_opts.add_option("-DSPATIAL_SCALE=" + support::cpp11::to_string(pool_info.spatial_scale()));
+
+ if (is_qasymm)
+ {
+ // Determine quantization info scale, offset
+        const UniformQuantizationInfo uqinfo = compute_requantization_scale_offset(
+            _input->info()->quantization_info().uniform(), _output->info()->quantization_info().uniform());
+ build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(uqinfo.offset));
+ build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(uqinfo.scale));
+
+ // Specify minimum possible value of datatype
+ build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(0));
+ }
+ else
+ {
+ // Specify min value of F32 datatype
+ build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(-FLT_MAX));
+ }
+
+ Window win = calculate_max_window(*(output->info()), Steps());
+ ICLKernel::configure_internal(win);
// Create kernel
std::string kernel_name = "roi_pooling_layer";
- _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
- // Set static kernel arguments
- unsigned int idx = 2 * num_arguments_per_3D_tensor() + num_arguments_per_1D_array();
- add_argument<cl_uint>(idx, _input->info()->strides_in_bytes()[3]);
- add_argument<cl_uint>(idx, _output->info()->strides_in_bytes()[3]);
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
- ICLKernel::configure_internal(win_config.second);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
void CLROIPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
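The QASYMM8 branch above relies on compute_requantization_scale_offset() to fold the input and output quantization parameters into a single scale/offset pair. The underlying identity, as a hedged standalone sketch (not a copy of the library helper):

#include <cmath>
#include <cstdint>

struct UQInfo { float scale; int32_t offset; };

// real = scale_in * (q_in - offset_in); re-expressed on the output grid this gives
// q_out = q_in * (scale_in / scale_out) + (offset_out - offset_in * scale_in / scale_out)
UQInfo requantization_params_sketch(const UQInfo &in, const UQInfo &out)
{
    UQInfo r;
    r.scale  = in.scale / out.scale;
    r.offset = out.offset - static_cast<int32_t>(std::lround(in.offset * r.scale));
    return r;
}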
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.h b/src/core/CL/kernels/CLROIPoolingLayerKernel.h
index 35f42a9676..80bfb63092 100644
--- a/src/core/CL/kernels/CLROIPoolingLayerKernel.h
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,9 +25,6 @@
#define ARM_COMPUTE_CLROIPOOLINGLAYERKERNEL_H
#include "src/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/CL/ICLArray.h"
-
namespace arm_compute
{
class ICLTensor;
@@ -62,11 +59,12 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void
+ configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: F16/F32.
+ * @param[in] input Source tensor. Data types supported: F16/F32/QASYMM8
* @param[in] rois ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner
* as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ]. Data types supported: U16
* @param[out] output Destination tensor. Data types supported: Same as @p input.
@@ -77,15 +75,37 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ const ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
+ /** Static function to check if the given info will lead to a valid configuration of @ref CLROIPoolingLayerKernel
+ *
+ * @param[in] input Source tensor info. Data types supported: F16/F32/QASYMM8
+ * @param[in] rois ROIs tensor info, a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corners
+ * as coordinates of an image and the batch_id of the ROI [ batch_id, x1, y1, x2, y2 ]. Data types supported: U16
+ * @param[in] output Destination tensor info. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
+ *
+ * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
+ * width and pooled height.
+ * @note The z dimensions of @p output tensor and @p input tensor must be the same.
+ * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
+ *
+ * @return a Status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
+
private:
const ICLTensor *_input;
const ICLTensor *_rois;
- ICLTensor *_output;
+ const ICLTensor *_output;
ROIPoolingLayerInfo _pool_info;
};
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLRangeKernel.cpp b/src/core/CL/kernels/CLRangeKernel.cpp
index 85f79988c9..622f6210b9 100644
--- a/src/core/CL/kernels/CLRangeKernel.cpp
+++ b/src/core/CL/kernels/CLRangeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,9 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -40,11 +43,8 @@ constexpr unsigned int vector_size_byte_opencl = 16;
Status validate_arguments(const ITensorInfo *output, const float start, const float end, const float step)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output,
- 1,
- DataType::U8, DataType::S8, DataType::QASYMM8,
- DataType::U16, DataType::S16,
- DataType::U32, DataType::S32,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16, DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output);
@@ -54,20 +54,24 @@ Status validate_arguments(const ITensorInfo *output, const float start, const fl
ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output->data_type(), output->quantization_info()), "start value is outside the range of the data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output->data_type(), output->quantization_info()), "end value is outside the range of the data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output->data_type(), output->quantization_info()), "step value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output->data_type(), output->quantization_info()),
+ "start value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output->data_type(), output->quantization_info()),
+ "end value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output->data_type(), output->quantization_info()),
+ "step value is outside the range of the data type");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->num_dimensions() != 1, "Output has to be a 1-D tensor");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() < num_of_elements_in_range(start, end, step), "Output tensor size is incorrect");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() < num_of_elements_in_range(start, end, step),
+ "Output tensor size is incorrect");
return Status{};
}
} // namespace
-CLRangeKernel::CLRangeKernel()
- : _start(0), _end(1), _step(1), _output(nullptr)
+CLRangeKernel::CLRangeKernel() : _start(0), _end(1), _step(1), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLRangeKernel::configure(ICLTensor *output, const float start, const float end, const float step)
@@ -75,16 +79,18 @@ void CLRangeKernel::configure(ICLTensor *output, const float start, const float
configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step);
}
-void CLRangeKernel::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
+void CLRangeKernel::configure(
+ const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(output->info(), start, end, step));
// Configure kernel window
- unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / output->info()->element_size(), output->info()->dimension(0));
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(vector_size_byte_opencl / output->info()->element_size(), output->info()->dimension(0));
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
- auto padding_info = get_padding_info({ output });
+ auto padding_info = get_padding_info({output});
_start = start;
_end = end;
@@ -97,10 +103,11 @@ void CLRangeKernel::configure(const CLCompileContext &compile_context, ICLTensor
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration));
build_opts.add_option("-DSTART=" + support::cpp11::to_string(start));
build_opts.add_option("-DSTEP=" + support::cpp11::to_string(step));
- if(is_data_type_quantized_asymmetric(output->info()->data_type()))
+ if (is_data_type_quantized_asymmetric(output->info()->data_type()))
{
const UniformQuantizationInfo qinfo = output->info()->quantization_info().uniform();
build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(qinfo.offset));
@@ -135,4 +142,4 @@ void CLRangeKernel::run(const Window &window, cl::CommandQueue &queue)
enqueue(queue, *this, window, lws_hint());
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
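CLRangeKernel now derives its vector width with adjust_vec_size() and compiles the remainder in as -DVEC_SIZE_LEFTOVER, so the kernel can handle a partial final vector without requiring padding. Roughly (a simplified sketch of the helper's behaviour, not its exact implementation):

// Shrink the preferred (power-of-two) vector width until it fits the x-dimension.
unsigned int adjust_vec_size_sketch(unsigned int vec_size, unsigned int dim_x)
{
    while (vec_size > dim_x && vec_size > 1)
        vec_size /= 2;
    return vec_size;
}
// The leftover lanes are then dim_x % vec_size, passed as VEC_SIZE_LEFTOVER.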
diff --git a/src/core/CL/kernels/CLRangeKernel.h b/src/core/CL/kernels/CLRangeKernel.h
index 1b94a099ed..65251a11e5 100644
--- a/src/core/CL/kernels/CLRangeKernel.h
+++ b/src/core/CL/kernels/CLRangeKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLRANGEKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 2697a0df98..c8665f8fbd 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,42 +28,47 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-// OpenCL kernel requires input width to be a power of 2 for x-axis.
-constexpr unsigned int border_val = 64;
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- if(input->num_channels() == 1)
+ if (input->num_channels() == 1)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::S32, DataType::F16, DataType::F32);
}
else
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(axis == 0);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, "Not supported reduction operation for QASYMM8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8,
+ "Not supported reduction operation for QASYMM8");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
- ARM_COMPUTE_RETURN_ERROR_ON((op == ReductionOperation::MEAN_SUM) && (axis == 0) && (width == 0) && (input->data_type() != DataType::QASYMM8) && (input->data_type() != DataType::QASYMM8_SIGNED));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN), "Not supported reduction operation, use CLArgMinMaxLayer");
+ ARM_COMPUTE_RETURN_ERROR_ON((op == ReductionOperation::MEAN_SUM) && (axis == 0) && (input->dimension(0) == 0) &&
+ (input->data_type() != DataType::QASYMM8) &&
+ (input->data_type() != DataType::QASYMM8_SIGNED));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN),
+ "Not supported reduction operation, use CLArgMinMaxLayer");
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -71,85 +76,50 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
return Status{};
}
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis, ReductionOperation op)
-{
- // Output tensor auto initialization if not yet initialized
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, true);
- DataType output_data_type = input->data_type();
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
-
- const unsigned int num_elems_processed_per_iteration = (is_data_type_quantized(input->data_type()) && (axis == 0)) ? 1 : 16;
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- bool window_changed = false;
- const bool is_serial_op = needs_serialized_reduction(op, input->data_type(), axis);
-
- switch(axis)
- {
- case 0:
- {
- if(!is_serial_op)
- {
- const unsigned int border_width = ((input->dimension(0) % border_val) != 0) ? border_val - input->dimension(0) % border_val : 0;
- AccessWindowStatic input_access(input, 0, 0, input->dimension(0) + border_width, 1);
- AccessWindowHorizontal output_access(output, 0, 1);
- window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- }
- }
- break;
- case 1:
- case 2:
- case 3:
- {
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-
- return std::make_tuple(err, win);
-}
} // namespace
CLReductionOperationKernel::CLReductionOperationKernel()
- : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::SUM_SQUARE), _border_size()
+ : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::SUM_SQUARE)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-BorderSize CLReductionOperationKernel::border_size() const
+void CLReductionOperationKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op)
{
- return _border_size;
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op);
}
-void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, unsigned int width)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op, width);
-}
-
-void CLReductionOperationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, unsigned int width)
+void CLReductionOperationKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op, width));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
+
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
_reduction_axis = axis;
_op = op;
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, true);
+ auto_init_if_empty(*output->info(),
+ input->info()->clone()->set_tensor_shape(output_shape).reset_padding().set_is_resizable(true));
+
// Set build options
CLBuildOptions build_opts;
DataType data_type = input->info()->data_type();
std::string data_type_promoted{};
- if(is_data_type_quantized(data_type))
+ if (is_data_type_quantized(data_type))
{
data_type_promoted = "int";
}
@@ -158,8 +128,15 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
data_type_promoted = get_cl_type_from_data_type(data_type);
}
+ const unsigned int width = input->info()->dimension(0) * input->info()->num_channels();
+ unsigned int vec_size = (is_data_type_quantized(input->info()->data_type()) && (axis == 0)) ? 1 : 16;
+ vec_size = adjust_vec_size(vec_size, width);
+ const unsigned int vec_size_leftover = width % vec_size;
+
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
build_opts.add_option("-DDATA_TYPE_PROMOTED=" + data_type_promoted);
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
build_opts.add_option_if(is_data_type_float(data_type), "-DFLOAT_DATA_TYPE");
build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE");
build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DMEAN");
@@ -167,11 +144,14 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
build_opts.add_option_if(op == ReductionOperation::PROD, "-DPROD");
build_opts.add_option_if(op == ReductionOperation::MIN, "-DMIN");
build_opts.add_option_if(op == ReductionOperation::MAX, "-DMAX");
- build_opts.add_option_if(input->info()->num_channels() == 2, "-DCOMPLEX");
- build_opts.add_option_if(is_data_type_quantized(data_type), "-DOFFSET=" + support::cpp11::to_string(input->info()->quantization_info().uniform().offset));
- build_opts.add_option_if(is_data_type_quantized(data_type), "-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().uniform().scale));
-
- switch(op)
+ build_opts.add_option_if(is_data_type_quantized(data_type),
+ "-DOFFSET=" +
+ support::cpp11::to_string(input->info()->quantization_info().uniform().offset));
+ build_opts.add_option_if(
+ is_data_type_quantized(data_type),
+ "-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().uniform().scale));
+
+ switch (op)
{
case ReductionOperation::SUM_SQUARE:
build_opts.add_option(("-DOPERATION=square_sum"));
@@ -181,7 +161,10 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
build_opts.add_option(("-DOPERATION=sum"));
break;
case ReductionOperation::MIN:
+ build_opts.add_option(("-DOPERATION=min_"));
+ break;
case ReductionOperation::MAX:
+ build_opts.add_option(("-DOPERATION=max_"));
break;
case ReductionOperation::PROD:
build_opts.add_option(("-DOPERATION=product"));
@@ -191,30 +174,15 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
}
// Create kernel
- cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange();
std::string kernel_axis_name;
const bool is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis);
- switch(axis)
+ switch (axis)
{
case 0:
{
- if(is_serial_op)
- {
- build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
- build_opts.add_option_if_else(_input->info()->data_type() == DataType::F16, "-DCOND_DATA_TYPE=short", "-DCOND_DATA_TYPE=int");
- kernel_axis_name = "non_parallel_x";
- }
- else
- {
- build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DWIDTH=" + support::cpp11::to_string(width));
- const unsigned int width_leftover = input->info()->dimension(0) % border_val;
- const unsigned int border_width = (width_leftover != 0) ? border_val - width_leftover : 0;
- kernel_axis_name = "x";
-
- lws_hint = create_lws_hint_parallel_implementations(input->info()->dimension(0), border_val);
- _border_size = BorderSize(0, border_width, 0, 0);
- }
+ build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(width));
+ kernel_axis_name = ((is_serial_op) ? "non_parallel_x" : "x");
}
break;
case 1:
@@ -236,18 +204,21 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
_kernel = create_kernel(compile_context, "reduction_operation_" + kernel_axis_name, build_opts.options());
// Configure kernel window
- auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis, op);
+ TensorShape actual_input_shape = input->info()->tensor_shape();
+ actual_input_shape[0] = width;
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+ Window win = calculate_max_window(actual_input_shape, Steps(vec_size));
+ ICLKernel::configure_internal(win);
- ICLKernel::configure_internal(std::get<1>(win_config), lws_hint);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width)
+Status CLReductionOperationKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ unsigned int axis,
+ ReductionOperation op)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op, width));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis, op)));
-
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
return Status{};
}
@@ -257,18 +228,19 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
const bool is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis);
- switch(_reduction_axis)
+ switch (_reduction_axis)
{
case 0:
{
// We use parallel reduction only in non quantized types
- if(is_serial_op)
+ if (is_serial_op)
{
// Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+ Window window_in{window};
+ window_in.set(Window::DimX,
+ Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
- Window out_window{ window };
+ Window out_window{window};
out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
Window in_slice = window_in.first_slice_window_1D();
@@ -279,91 +251,114 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
unsigned int idx = 0;
add_1D_tensor_argument(idx, _input, in_slice);
add_1D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window_in.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
+ enqueue(queue, *this, in_slice);
+ } while (window_in.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
}
else
{
// Set out window
- Window out_window(window);
- out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+ bool has_collapsed = true;
+ Window window_in = window.collapse_if_possible(window, 2, &has_collapsed);
+ ARM_COMPUTE_ERROR_ON(!has_collapsed);
- // Get first input and output slices
- Window in_slice = window.first_slice_window_2D();
- Window out_slice = out_window.first_slice_window_2D();
-
- // Reshape window
- const unsigned int border_width = ((in_slice.x().end() % border_val) != 0) ? border_val - in_slice.x().end() % border_val : 0;
- in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step()));
-
- // Set local sums buffer
- unsigned int local_res_size = lws_hint()[0] * _input->info()->element_size();
- _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_res_size, nullptr);
+ Window window_out = window_in;
+ window_out.set(0, Window::Dimension());
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, in_slice);
- add_2D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, window_in);
+ add_3D_tensor_argument(idx, _output, window_out);
+ enqueue(queue, *this, window_in);
}
}
break;
case 1:
{
- // Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
- Window in_slice = window_in.first_slice_window_2D();
- Window out_slice = window.first_slice_window_2D();
+ bool has_collapsed = true;
+ Window actual_window = window.collapse_if_possible(window, 2, &has_collapsed);
+ ARM_COMPUTE_ERROR_ON(!has_collapsed);
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, in_slice);
- add_2D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+ actual_window = actual_window.shift_dimensions(1, Window::DimY);
+
+ const ITensorInfo *input_info = _input->info();
+ const Strides &input_strides = input_info->strides_in_bytes();
+
+ const ITensorInfo *output_info = _output->info();
+ const Strides &output_strides = output_info->strides_in_bytes();
+
+ unsigned int idx = 0;
+
+ _kernel.setArg(idx++, _input->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, input_strides[1]);
+ _kernel.setArg<cl_uint>(idx++, input_strides[2]);
+ _kernel.setArg<cl_uint>(idx++, input_info->offset_first_element_in_bytes());
+
+ _kernel.setArg(idx++, _output->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, output_strides[2]);
+ _kernel.setArg<cl_uint>(idx++, output_info->offset_first_element_in_bytes());
+
+ enqueue(queue, *this, actual_window);
}
break;
case 2:
{
- // Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
- Window in_slice = window_in.first_slice_window_3D();
- Window out_slice = window.first_slice_window_3D();
+ bool has_collapsed = true;
+ Window actual_window = window.collapse_if_possible(window, 3, &has_collapsed);
+ ARM_COMPUTE_ERROR_ON(!has_collapsed);
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, in_slice);
- add_3D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
+ actual_window = actual_window.shift_dimensions(1, Window::DimZ);
+
+ const ITensorInfo *input_info = _input->info();
+ const Strides &input_strides = input_info->strides_in_bytes();
+
+ const ITensorInfo *output_info = _output->info();
+ const Strides &output_strides = output_info->strides_in_bytes();
+
+ unsigned int idx = 0;
+
+ _kernel.setArg(idx++, _input->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, input_strides[1]);
+ _kernel.setArg<cl_uint>(idx++, input_strides[2]);
+ _kernel.setArg<cl_uint>(idx++, input_strides[3]);
+ _kernel.setArg<cl_uint>(idx++, input_info->offset_first_element_in_bytes());
+
+ _kernel.setArg(idx++, _output->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, output_strides[1]);
+ _kernel.setArg<cl_uint>(idx++, output_strides[3]);
+ _kernel.setArg<cl_uint>(idx++, output_info->offset_first_element_in_bytes());
+
+ enqueue(queue, *this, actual_window);
}
break;
case 3:
{
- // Get first input and output slices
- Window window_in{ window };
- window_in.set(3, Window::Dimension(0, 1, 1));
- Window in_slice = window_in.first_slice_window_4D();
- Window out_slice = window.first_slice_window_4D();
+ bool has_collapsed = true;
+ Window actual_window = window.shift_dimensions(1, Window::DimW);
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, in_slice);
- add_4D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
+ actual_window = actual_window.collapse_if_possible(actual_window, 2, &has_collapsed);
+ ARM_COMPUTE_ERROR_ON(!has_collapsed);
+
+ const ITensorInfo *input_info = _input->info();
+ const Strides &input_strides = input_info->strides_in_bytes();
+
+ const ITensorInfo *output_info = _output->info();
+ const Strides &output_strides = output_info->strides_in_bytes();
+
+ unsigned int idx = 0;
+
+ _kernel.setArg(idx++, _input->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, input_strides[1]);
+ _kernel.setArg<cl_uint>(idx++, input_strides[2]);
+ _kernel.setArg<cl_uint>(idx++, input_strides[3]);
+ _kernel.setArg<cl_uint>(idx++, input_strides[4]);
+ _kernel.setArg<cl_uint>(idx++, input_info->offset_first_element_in_bytes());
+
+ _kernel.setArg(idx++, _output->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, output_strides[1]);
+ _kernel.setArg<cl_uint>(idx++, output_strides[2]);
+ _kernel.setArg<cl_uint>(idx++, output_strides[4]);
+ _kernel.setArg<cl_uint>(idx++, output_info->offset_first_element_in_bytes());
+
+ enqueue(queue, *this, actual_window);
}
break;
default:
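The rewritten configure() auto-initialises the output from compute_reduced_shape(input_shape, axis, true): the reduced axis collapses to one element while the remaining dimensions are preserved. In effect (a standalone sketch of the keep-dims case only):

#include <vector>

// keep_dims case: (W, H, C, N) reduced over axis 1 -> (W, 1, C, N)
std::vector<size_t> reduced_shape_sketch(std::vector<size_t> shape, unsigned int axis)
{
    shape[axis] = 1; // the reduced axis collapses to one element
    return shape;
}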
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.h b/src/core/CL/kernels/CLReductionOperationKernel.h
index ff9fd61484..2f94b2add3 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.h
+++ b/src/core/CL/kernels/CLReductionOperationKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -56,9 +57,8 @@ public:
* Output will have the same number of dimensions as input.
* @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
* @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
- * @param[in] width (Optional) In case of x-axis we also need to provide the width of the input image.
*/
- void configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, unsigned int width = 0);
+ void configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -67,9 +67,12 @@ public:
* Output will have the same number of dimensions as input.
* @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
* @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
- * @param[in] width (Optional) In case of x-axis we also need to provide the width of the input image.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, unsigned int width = 0);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op);
/** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperationKernel.
*
@@ -78,22 +81,20 @@ public:
* Output will have the same number of dimensions as input.
* @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
* @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
- * @param[in] width (Optional) In case of x-axis we also need to provide the width of the input image.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width = 0);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
private:
const ICLTensor *_input;
ICLTensor *_output;
unsigned int _reduction_axis;
ReductionOperation _op;
- BorderSize _border_size;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H */
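With the optional width parameter gone from both configure() overloads and from validate(), callers now pass only the tensor pair, the axis and the operation; the kernel derives the width internally from the input info. A hedged usage sketch (tensor setup omitted, names illustrative, and the usual CLScheduler initialisation assumed):

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/core/CL/kernels/CLReductionOperationKernel.h"

using namespace arm_compute;

void run_sum_reduction_sketch(ICLTensor &src, ICLTensor &dst, cl::CommandQueue & /*queue*/)
{
    CLReductionOperationKernel reduction;
    const Status st =
        CLReductionOperationKernel::validate(src.info(), dst.info(), /*axis=*/0, ReductionOperation::SUM);
    if (st.error_code() == ErrorCode::OK)
    {
        reduction.configure(&src, &dst, 0, ReductionOperation::SUM); // width no longer supplied
        CLScheduler::get().enqueue(reduction);
    }
}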
diff --git a/src/core/CL/kernels/CLRemapKernel.cpp b/src/core/CL/kernels/CLRemapKernel.cpp
deleted file mode 100644
index 0ebeefcc74..0000000000
--- a/src/core/CL/kernels/CLRemapKernel.cpp
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLRemapKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <algorithm>
-
-using namespace arm_compute;
-
-CLRemapKernel::CLRemapKernel()
- : _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr)
-{
-}
-
-BorderSize CLRemapKernel::border_size() const
-{
- return BorderSize(1);
-}
-
-void CLRemapKernel::configure(const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, bool border_undefined)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, map_x, map_y, output, policy, border_undefined);
-}
-
-void CLRemapKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy,
- bool border_undefined)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported!");
- ARM_COMPUTE_UNUSED(border_undefined);
-
- _input = input;
- _output = output;
- _map_x = map_x;
- _map_y = map_y;
-
- // Create kernel
- std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) };
- std::string interpolation_name = string_from_interpolation_policy(policy);
- std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
- std::string kernel_name = "remap_" + interpolation_name;
- _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
- // Configure window
- constexpr unsigned int num_elems_processed_per_iteration = 4;
-
- const int total_right = ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration);
- const int access_right = total_right + (((total_right - input->info()->dimension(0)) == 0) ? border_size().right : 0);
-
- Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowStatic input_access(input->info(), -border_size().left, -border_size().top, access_right, input->info()->dimension(1) + border_size().bottom);
-
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-
- // Set static arguments
-    unsigned int idx = 4 * num_arguments_per_2D_tensor(); // Skip the input, output, map_x and map_y tensor parameters
- _kernel.setArg<cl_float>(idx++, input->info()->dimension(0));
- _kernel.setArg<cl_float>(idx++, input->info()->dimension(1));
-}
-
-void CLRemapKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
- add_2D_tensor_argument(idx, _output, slice);
- add_2D_tensor_argument(idx, _map_x, slice);
- add_2D_tensor_argument(idx, _map_y, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
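
For readers unfamiliar with the operation removed above: the kernel computes a remap, where every output pixel is fetched from the input at the coordinates held in the two F32 map tensors. A minimal host-side sketch of the nearest-neighbour variant on raw buffers (illustrative only; the function name and the out-of-bounds behaviour are assumptions, not the kernel's exact OpenCL code, whose border handling depends on the configured border mode):

#include <cmath>
#include <cstdint>
#include <vector>

// Nearest-neighbour remap on a single-channel U8 image, mirroring the
// semantics of the deleted remap_nearest_neighbour kernel:
//   dst(x, y) = src(round(map_x(x, y)), round(map_y(x, y)))
// Out-of-range coordinates produce 0 in this sketch.
std::vector<uint8_t> remap_nearest(const std::vector<uint8_t> &src,
                                   int width, int height,
                                   const std::vector<float> &map_x,
                                   const std::vector<float> &map_y)
{
    std::vector<uint8_t> dst(src.size(), 0);
    for (int y = 0; y < height; ++y)
    {
        for (int x = 0; x < width; ++x)
        {
            const int sx = static_cast<int>(std::lround(map_x[y * width + x]));
            const int sy = static_cast<int>(std::lround(map_y[y * width + x]));
            if (sx >= 0 && sx < width && sy >= 0 && sy < height)
            {
                dst[y * width + x] = src[sy * width + sx];
            }
        }
    }
    return dst;
}
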
diff --git a/src/core/CL/kernels/CLRemapKernel.h b/src/core/CL/kernels/CLRemapKernel.h
deleted file mode 100644
index 8efcf091ed..0000000000
--- a/src/core/CL/kernels/CLRemapKernel.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLREMAPKERNEL_H
-#define ARM_COMPUTE_CLREMAPKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform a remap on a tensor */
-class CLRemapKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLRemapKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLRemapKernel(const CLRemapKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLRemapKernel &operator=(const CLRemapKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLRemapKernel(CLRemapKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLRemapKernel &operator=(CLRemapKernel &&) = default;
- /** Initialize the kernel's input, output and border mode.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[in] map_x Map for X coordinates. Data types supported: F32.
- * @param[in] map_y Map for Y coordinates. Data types supported: F32.
- * @param[out] output Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
- * @param[in] policy The interpolation type.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, bool border_undefined);
- /** Initialize the kernel's input, output and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[in] map_x Map for X coordinates. Data types supported: F32.
- * @param[in] map_y Map for Y coordinates. Data types supported: F32.
- * @param[out] output Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
- * @param[in] policy The interpolation type.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- const ICLTensor *_map_x;
- const ICLTensor *_map_y;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLREMAPKERNEL_H */
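
For historical reference, a caller would have driven the kernel through the interface above. A hedged sketch, assuming already-allocated CL tensors of the documented data types and a source tree that still contains this internal header:

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/core/CL/kernels/CLRemapKernel.h" // internal header, pre-removal

using namespace arm_compute;

// Hypothetical driver matching the configure() signature in the header above.
void run_remap(CLTensor &input, CLTensor &map_x, CLTensor &map_y, CLTensor &output)
{
    CLRemapKernel remap;
    remap.configure(&input, &map_x, &map_y, &output,
                    InterpolationPolicy::NEAREST_NEIGHBOR,
                    /* border_undefined = */ false);
    CLScheduler::get().enqueue(remap);
}
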
diff --git a/src/core/CL/kernels/CLReorgLayerKernel.cpp b/src/core/CL/kernels/CLReorgLayerKernel.cpp
index 01853450ee..9fd21943e8 100644
--- a/src/core/CL/kernels/CLReorgLayerKernel.cpp
+++ b/src/core/CL/kernels/CLReorgLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,8 +28,10 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -50,13 +52,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
ARM_COMPUTE_RETURN_ERROR_ON(stride <= 0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, "The width of the input tensor must be a multiple of stride");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, "The height of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0,
+ "The width of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0,
+ "The height of the input tensor must be a multiple of stride");
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
+ const TensorInfo tensor_info_output =
+ output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -65,9 +70,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
}
} // namespace
-CLReorgLayerKernel::CLReorgLayerKernel()
- : _input(nullptr), _output(nullptr)
+CLReorgLayerKernel::CLReorgLayerKernel() : _input(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLReorgLayerKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t stride)
@@ -75,17 +80,22 @@ void CLReorgLayerKernel::configure(const ICLTensor *input, ICLTensor *output, in
configure(CLKernelLibrary::get().get_compile_context(), input, output, stride);
}
-void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t stride)
+void CLReorgLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ int32_t stride)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), stride));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
- std::string kernel_name = std::string("reorg_layer_") + lower_string(string_from_data_layout(input->info()->data_layout()));
- const size_t idx_channel = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+ std::string kernel_name =
+ std::string("reorg_layer_") + lower_string(string_from_data_layout(input->info()->data_layout()));
+ const size_t idx_channel =
+ get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
// Create kernel
CLBuildOptions build_opts;
@@ -96,12 +106,13 @@ void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, cons
// Configure window
    // auto initialize the output tensor if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride)));
+ auto_init_if_empty(*output->info(),
+ input->info()->clone()->set_tensor_shape(
+ misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride)));
Window win = calculate_max_window(*output->info(), Steps());
    // This kernel doesn't need padding, so update_window_and_padding() can be skipped
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
ICLKernel::configure_internal(win);
_config_id = kernel_name;
@@ -118,7 +129,9 @@ void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, cons
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLReorgLayerKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, int32_t stride)
+Status CLReorgLayerKernel::validate(const arm_compute::ITensorInfo *input,
+ const arm_compute::ITensorInfo *output,
+ int32_t stride)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, stride));
@@ -138,7 +151,6 @@ void CLReorgLayerKernel::run(const Window &window, cl::CommandQueue &queue)
add_3D_tensor_argument(idx, _input, slice);
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace arm_compute
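
The validation above only accepts widths and heights that are multiples of the stride because reorg moves each stride x stride spatial block into the channel dimension. A sketch of the resulting shape arithmetic (a simplified stand-in for misc::shape_calculator::compute_reorg_output_shape, under ACL's dimension(0..3) = {W, H, C, N} ordering for NCHW; the helper name is ours):

#include <array>
#include <cassert>
#include <cstddef>

std::array<size_t, 4> reorg_output_shape(const std::array<size_t, 4> &in, size_t stride)
{
    assert(stride > 0 && in[0] % stride == 0 && in[1] % stride == 0);
    return {in[0] / stride,          // width shrinks by stride
            in[1] / stride,          // height shrinks by stride
            in[2] * stride * stride, // channels absorb the spatial blocks
            in[3]};                  // batch is untouched
}
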
diff --git a/src/core/CL/kernels/CLReorgLayerKernel.h b/src/core/CL/kernels/CLReorgLayerKernel.h
index 455a6170c6..f335071e9f 100644
--- a/src/core/CL/kernels/CLReorgLayerKernel.h
+++ b/src/core/CL/kernels/CLReorgLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLREORGLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
diff --git a/src/core/CL/kernels/CLReverseKernel.cpp b/src/core/CL/kernels/CLReverseKernel.cpp
index b3c9bcafd1..00241b161b 100644
--- a/src/core/CL/kernels/CLReverseKernel.cpp
+++ b/src/core/CL/kernels/CLReverseKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,6 +28,9 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -37,17 +40,21 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis)
{
+ ARM_COMPUTE_UNUSED(use_inverted_axis);
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, axis);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4,
+ "Current implementation only supports up to 4 dimensions.");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->dimension(0) > 4, "Only up to 4 dimensions can be reversed");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -58,20 +65,27 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-CLReverseKernel::CLReverseKernel()
- : _input(nullptr), _output(nullptr), _axis(nullptr)
+CLReverseKernel::CLReverseKernel() : _input(nullptr), _output(nullptr), _axis(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+void CLReverseKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *axis,
+ bool use_inverted_axis)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, axis);
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, use_inverted_axis);
}
-void CLReverseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+void CLReverseKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *axis,
+ bool use_inverted_axis)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, axis);
- auto padding_info = get_padding_info({ input, output, axis });
+ auto padding_info = get_padding_info({input, output, axis});
_input = input;
_output = output;
@@ -80,12 +94,14 @@ void CLReverseKernel::configure(const CLCompileContext &compile_context, const I
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output->info(), *input->info()->clone());
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis->info(), use_inverted_axis));
// Set kernel build options
CLBuildOptions build_opts;
build_opts.add_option("-DNUM_REVERSE_DIMS=" + support::cpp11::to_string(axis->info()->dimension(0)));
build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
+ build_opts.add_option("-DRANK=" + support::cpp11::to_string(input->info()->num_dimensions()));
+ build_opts.add_option_if(use_inverted_axis, "-DUSE_INVERTED_AXIS");
// Create kernel
_kernel = create_kernel(compile_context, "reverse", build_opts.options());
@@ -113,9 +129,12 @@ void CLReverseKernel::configure(const CLCompileContext &compile_context, const I
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+Status CLReverseKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *axis,
+ bool use_inverted_axis)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, use_inverted_axis));
return Status{};
}
@@ -135,7 +154,6 @@ void CLReverseKernel::run(const Window &window, cl::CommandQueue &queue)
add_1D_tensor_argument(idx, _axis, axis_slice);
add_4D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_4D(slice));
+ } while (collapsed.slide_window_slice_4D(slice));
}
} // namespace arm_compute
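
The new use_inverted_axis flag and S32 axis support change how an axis value maps onto a tensor dimension. A sketch of the index arithmetic the kernel documentation implies (wrap negatives into [0, rank), then optionally flip to the inverted convention where acl.dim(0) = tensor_rank - 1; the helper name is ours, not an ACL API):

#include <cassert>

// Normalise a (possibly negative) reverse axis to an ACL dimension index.
// Values must lie in [-rank, rank), as the kernel documentation notes.
int normalise_reverse_axis(int axis, int rank, bool use_inverted_axis)
{
    assert(axis >= -rank && axis < rank);
    if (axis < 0)
    {
        axis += rank; // wrap, e.g. -1 becomes rank - 1
    }
    return use_inverted_axis ? (rank - 1 - axis) : axis;
}
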
diff --git a/src/core/CL/kernels/CLReverseKernel.h b/src/core/CL/kernels/CLReverseKernel.h
index 4a21e4f802..a630aec15a 100644
--- a/src/core/CL/kernels/CLReverseKernel.h
+++ b/src/core/CL/kernels/CLReverseKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLREVERSEKERNEL_H
-#define ARM_COMPUTE_CLREVERSEKERNEL_H
+#ifndef ACL_SRC_CORE_CL_KERNELS_CLREVERSEKERNEL_H
+#define ACL_SRC_CORE_CL_KERNELS_CLREVERSEKERNEL_H
#include "src/core/CL/ICLKernel.h"
@@ -48,29 +48,43 @@ public:
~CLReverseKernel() = default;
 /** Initialise the kernel's inputs and output
*
- * @param[in] input Input tensor. Data types supported: All.
- * @param[out] output Output tensor. Data type supported: Same as @p input
- * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
+ * @param[in] input Input tensor. Data types supported: All.
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+ * @param[in]  use_inverted_axis When true, reverse the ACL axis index convention, i.e. acl.dim(0) = tensor_rank - 1
+ *
+ * @note The value of each axis should be within the range [-rank, rank)
+ * @note If there are duplicate values in the tensor, the subsequent axis values are ignored, e.g. an array of [2, 2] has the same effect as [2].
+ *
+ * @deprecated Support for U32 in axis tensor will be removed in 24.02 release
+ *
*/
- void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis);
+ void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis, bool use_inverted_axis);
 /** Initialise the kernel's inputs and output
*
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: All.
- * @param[out] output Output tensor. Data type supported: Same as @p input
- * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Input tensor. Data types supported: All.
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+ * @param[in]  use_inverted_axis When true, reverse the ACL axis index convention, i.e. acl.dim(0) = tensor_rank - 1
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *axis,
+ bool use_inverted_axis);
/** Static function to check if given info will lead to a valid configuration of @ref CLReverseKernel
*
- * @param[in] input Input tensor info. Data types supported: All.
- * @param[in] output Output tensor info. Data type supported: Same as @p input
- * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32
+ * @param[in] input Input tensor info. Data types supported: All.
+ * @param[in] output Output tensor info. Data type supported: Same as @p input
+ * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+ * @param[in]  use_inverted_axis When true, reverse the ACL axis index convention, i.e. acl.dim(0) = tensor_rank - 1
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
@@ -81,4 +95,4 @@ public:
const ICLTensor *_axis;
};
} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLREVERSEKERNEL_H */
+#endif // ACL_SRC_CORE_CL_KERNELS_CLREVERSEKERNEL_H
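
A hedged example of exercising the updated validate() overload with the new S32 axis type (TensorInfo construction follows the public ACL API; the shapes are arbitrary and the function is ours):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "src/core/CL/kernels/CLReverseKernel.h" // internal header

using namespace arm_compute;

// Check a 3D F32 reverse over one axis, using the signed axis data type that
// this patch adds and the inverted-axis convention.
Status check_reverse()
{
    const TensorInfo input(TensorShape(8U, 4U, 2U), 1, DataType::F32);
    const TensorInfo output(TensorShape(8U, 4U, 2U), 1, DataType::F32);
    const TensorInfo axis(TensorShape(1U), 1, DataType::S32);
    return CLReverseKernel::validate(&input, &output, &axis, /* use_inverted_axis = */ true);
}
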
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
deleted file mode 100644
index c2c78c8f6d..0000000000
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLScaleKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include "src/core/utils/ScaleUtils.h"
-
-#include <set>
-#include <string>
-
-namespace arm_compute
-{
-namespace
-{
-inline std::pair<float, float> calculate_scale_factors(const ITensorInfo &input, const ITensorInfo &output, const ScaleKernelInfo &info)
-{
- const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? input.data_layout() : info.data_layout;
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
- // Compute the ratio between source width/height and destination width/height
- const unsigned int input_width = input.dimension(idx_width);
- const unsigned int input_height = input.dimension(idx_height);
- const unsigned int output_width = output.dimension(idx_width);
- const unsigned int output_height = output.dimension(idx_height);
-
- float wr = arm_compute::scale_utils::calculate_resize_ratio(input_width, output_width, info.align_corners);
- float hr = arm_compute::scale_utils::calculate_resize_ratio(input_height, output_height, info.align_corners);
-
- return std::make_pair(wr, hr);
-}
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(output == input);
- ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
-
- float wr = 0.f;
- float hr = 0.f;
- std::tie(wr, hr) = calculate_scale_factors(*input, *output, info);
-
- ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA && (wr > 1.f || hr > 1.f));
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const ScaleKernelInfo &info, BorderSize &border)
-{
- Window win{};
- bool window_changed{};
- unsigned int num_elems_processed_per_iteration = 0;
- const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : info.data_layout;
-
- switch(data_layout)
- {
- case DataLayout::NCHW:
- {
- if(info.border_mode == BorderMode::UNDEFINED)
- {
- border = BorderSize(0);
- }
-
- num_elems_processed_per_iteration = 4;
- // Configure kernel window
- win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- AccessWindowStatic input_access(input,
- -border.left, -border.top,
- input->dimension(0) + border.right,
- input->dimension(1) + border.bottom);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
- output_access.set_valid_region(win, calculate_valid_region_scale(*(input),
- output->tensor_shape(),
- info.interpolation_policy,
- info.sampling_policy,
- info.border_mode == BorderMode::UNDEFINED));
-
- window_changed = update_window_and_padding(win, input_access, output_access);
- }
- break;
- case DataLayout::NHWC:
- {
- // Configure kernel window
- win = calculate_max_window(*output, Steps());
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Data layout not supported");
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-BorderSize CLScaleKernel::border_size() const
-{
- return BorderSize(static_cast<size_t>(_data_layout == DataLayout::NCHW));
-}
-
-Status CLScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info));
- const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : info.data_layout;
- BorderSize border = BorderSize(static_cast<size_t>(data_layout == DataLayout::NCHW));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), info, border).first);
-
- return Status{};
-}
-
-const ICLTensor *CLScaleKernel::input() const
-{
- return _input;
-}
-
-const ICLTensor *CLScaleKernel::output() const
-{
- return _output;
-}
-
-void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
-}
-
-void CLScaleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info)
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), info));
- auto padding_info = get_padding_info({ input, output });
-
- _input = input;
- _output = output;
- _interpolation_policy = info.interpolation_policy;
- _data_layout = info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : info.data_layout;
- _align_corners = info.align_corners;
-
- float wr = 0.f;
- float hr = 0.f;
- std::tie(wr, hr) = calculate_scale_factors(*input->info(), *output->info(), info);
-
- const bool call_quantized_kernel = is_data_type_quantized_asymmetric(input->info()->data_type()) && _interpolation_policy == InterpolationPolicy::BILINEAR;
-
- const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- const bool is_nhwc = _data_layout == DataLayout::NHWC;
-
- // Compute actual border size
- BorderSize border = border_size();
-
- auto interpolation_policy_to_use = _interpolation_policy;
- // Area interpolation behaves as Nearest Neighbour in case of up-sampling
- if(_interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
- {
- interpolation_policy_to_use = InterpolationPolicy::NEAREST_NEIGHBOR;
- }
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), info, border);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, input->info()->data_type()));
- build_opts.add_option("-DBORDER_SIZE=" + support::cpp11::to_string(border.right));
- build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
- build_opts.add_option_if(is_nhwc, "-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
- build_opts.add_option_if(_align_corners, "-DALIGN_CORNERS");
- if(call_quantized_kernel)
- {
- const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
- build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale));
- build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset));
- }
- std::string interpolation_name = string_from_interpolation_policy(interpolation_policy_to_use);
- std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
- std::string kernel_name = "scale_" + interpolation_name;
- kernel_name += call_quantized_kernel ? "_quantized_" : "_";
- kernel_name += lower_string(string_from_data_layout(_data_layout));
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- unsigned int idx = is_nhwc ? 2 * num_arguments_per_4D_tensor() : 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-
- const unsigned int input_width = input->info()->dimension(idx_width);
- const unsigned int input_height = input->info()->dimension(idx_height);
-
- _kernel.setArg<float>(idx++, input_width);
- _kernel.setArg<float>(idx++, input_height);
- _kernel.setArg<float>(idx++, wr);
- _kernel.setArg<float>(idx++, hr);
-
- // Set config_id for enabling LWS tuning
- _config_id = "scale_";
- _config_id += (info.border_mode == BorderMode::REPLICATE ? "Bord_rep" : "");
- _config_id += (info.sampling_policy == SamplingPolicy::CENTER ? "center" : "topleft");
- _config_id += (is_nhwc ? "nhwc" : "nchw");
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(3));
- if(is_nhwc)
- {
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
- }
-}
-
-void CLScaleKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- switch(_data_layout)
- {
- case DataLayout::NCHW:
- {
- Window slice = window.first_slice_window_2D();
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
- add_2D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
- break;
- }
- case DataLayout::NHWC:
- {
- Window collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_4D();
-
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice);
- add_4D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Data layout not supported");
- }
-}
-} // namespace arm_compute
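
The deleted code leans on scale_utils::calculate_resize_ratio for wr and hr. A sketch of the conventional formula (assuming the usual align-corners definition; consult src/core/utils/ScaleUtils.h for the authoritative version):

// Ratio of input extent to output extent along one dimension. With
// align_corners, the outermost samples of input and output coincide, so the
// ratio is taken over (size - 1) intervals instead of size. Callers are
// expected to guarantee output_size > 1 when align_corners is set.
float calculate_resize_ratio(unsigned int input_size, unsigned int output_size, bool align_corners)
{
    if (align_corners)
    {
        return static_cast<float>(input_size - 1) / static_cast<float>(output_size - 1);
    }
    return static_cast<float>(input_size) / static_cast<float>(output_size);
}
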
diff --git a/src/core/CL/kernels/CLScaleKernel.h b/src/core/CL/kernels/CLScaleKernel.h
deleted file mode 100644
index a72e3938d9..0000000000
--- a/src/core/CL/kernels/CLScaleKernel.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSCALEKERNEL_H
-#define ARM_COMPUTE_CLSCALEKERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the scale kernel */
-class CLScaleKernel : public ICLSimple2DKernel
-{
-public:
- /** Initialise the kernel's inputs, output and interpolation policy
- *
- * @param[in] input Source tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
- * @param[out] output Destination tensor. Data types supported: Same as @p input
- * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] info @ref ScaleKernelInfo Kernel descriptor to be used to configure.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info);
- /** Initialise the kernel's inputs, output and interpolation policy
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
- * @param[out] output Destination tensor. Data types supported: Same as @p input
- * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] info @ref ScaleKernelInfo Kernel descriptor to be used to configure.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLScaleKernel
- *
- * @param[in] input Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
- * @param[in] output Destination tensor info. Data types supported: Same as @p input
- * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] info @ref ScaleKernelInfo Kernel descriptor to be used to validate
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info);
- /** Input tensor accessor.
- *
- * @return Pointer to input tensor.
- */
- const ICLTensor *input() const;
- /** Output tensor accessor.
- *
- * @return Pointer to output tensor.
- */
- const ICLTensor *output() const;
-
- // Inherited methods overridden:
- BorderSize border_size() const override;
- void run(const Window &window, cl::CommandQueue &queue) override;
-
- // Getter for interpolation policy
- InterpolationPolicy get_interpolation_policy() const
- {
- return _interpolation_policy;
- }
-
-private:
- InterpolationPolicy _interpolation_policy = InterpolationPolicy::BILINEAR;
- DataLayout _data_layout = DataLayout::UNKNOWN;
- bool _align_corners = false;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSCALEKERNEL_H */
diff --git a/src/core/CL/kernels/CLSelectKernel.cpp b/src/core/CL/kernels/CLSelectKernel.cpp
index f8e63ddc43..703c64d8d3 100644
--- a/src/core/CL/kernels/CLSelectKernel.cpp
+++ b/src/core/CL/kernels/CLSelectKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,10 +29,11 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
namespace arm_compute
@@ -50,9 +51,11 @@ Status validate_arguments(const ITensorInfo *c, const ITensorInfo *x, const ITen
const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(is_same_rank && (x->tensor_shape() != c->tensor_shape()));
- ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && ((c->tensor_shape().num_dimensions() > 1) || (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
+ ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank &&
+ ((c->tensor_shape().num_dimensions() > 1) ||
+ (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, output);
@@ -62,12 +65,16 @@ Status validate_arguments(const ITensorInfo *c, const ITensorInfo *x, const ITen
}
} // namespace
-CLSelectKernel::CLSelectKernel()
- : _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
+CLSelectKernel::CLSelectKernel() : _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLSelectKernel::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output)
+void CLSelectKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *c,
+ const ICLTensor *x,
+ const ICLTensor *y,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(c, x, y, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(c->info(), x->info(), y->info(), output->info()));
@@ -78,7 +85,7 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC
_output = output;
_has_same_rank = (c->info()->tensor_shape().num_dimensions() == x->info()->tensor_shape().num_dimensions());
- auto padding_info = get_padding_info({ c, x, y, output });
+ auto padding_info = get_padding_info({c, x, y, output});
const unsigned int vec_size_x = adjust_vec_size(16 / x->info()->element_size(), x->info()->dimension(0));
const int vec_size_x_leftovers = output->info()->dimension(0) % vec_size_x;
@@ -90,14 +97,14 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC
// Create kernel
std::string kernel_name = "select";
- if(_has_same_rank)
+ if (_has_same_rank)
{
kernel_name += "_same_rank";
}
else
{
const bool is_input_rank_greater_than_two = x->info()->tensor_shape().num_dimensions() > 2;
- if(is_input_rank_greater_than_two)
+ if (is_input_rank_greater_than_two)
{
const size_t width = x->info()->tensor_shape().x();
const size_t height = x->info()->tensor_shape().y();
@@ -126,7 +133,8 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLSelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+Status
+CLSelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(c, x, y, output));
return Status{};
@@ -140,7 +148,7 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu
Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
Window slice = collapsed.first_slice_window_3D();
- if(!_has_same_rank)
+ if (!_has_same_rank)
{
Window vector_slice = window.first_slice_window_1D();
vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
@@ -151,7 +159,7 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu
do
{
unsigned int idx = _has_same_rank ? 0 : num_arguments_per_1D_tensor();
- if(_has_same_rank)
+ if (_has_same_rank)
{
add_3D_tensor_argument(idx, _c, slice);
}
@@ -160,7 +168,6 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
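
The two code paths above (same-rank condition versus a rank-1 condition tensor) implement the usual select semantics. A scalar sketch of both cases (names and layout assumptions are ours, not ACL API):

#include <cstddef>
#include <cstdint>
#include <vector>

// Same-rank select: out[i] = c[i] ? x[i] : y[i], element by element.
template <typename T>
std::vector<T> select_same_rank(const std::vector<uint8_t> &c,
                                const std::vector<T> &x, const std::vector<T> &y)
{
    std::vector<T> out(x.size());
    for (std::size_t i = 0; i < x.size(); ++i)
    {
        out[i] = c[i] ? x[i] : y[i];
    }
    return out;
}

// Rank-1 condition: one flag per outermost slice of x/y, as in the
// is_input_rank_greater_than_two branch above. slice_size is the number of
// elements in one slice along the last dimension.
template <typename T>
std::vector<T> select_per_slice(const std::vector<uint8_t> &c,
                                const std::vector<T> &x, const std::vector<T> &y,
                                std::size_t slice_size)
{
    std::vector<T> out(x.size());
    for (std::size_t i = 0; i < x.size(); ++i)
    {
        out[i] = c[i / slice_size] ? x[i] : y[i];
    }
    return out;
}
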
diff --git a/src/core/CL/kernels/CLSelectKernel.h b/src/core/CL/kernels/CLSelectKernel.h
index b8c10cd7cf..c4256fd743 100644
--- a/src/core/CL/kernels/CLSelectKernel.h
+++ b/src/core/CL/kernels/CLSelectKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLSELECTKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -60,7 +61,11 @@ public:
* @param[out] y Second input tensor. Data types supported: Same as @p x
* @param[in] output Output tensor. Data types supported: Same as @p x.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *c,
+ const ICLTensor *x,
+ const ICLTensor *y,
+ ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLSelectKernel
*
* @param[in] c Condition input tensor. Data types supported: U8.
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
deleted file mode 100644
index 526d9e187d..0000000000
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ /dev/null
@@ -1,370 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLSoftmaxLayerKernel.h"
-
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-/** Calculates softmax parameters from the quantized input scale and scaling factor for the exponent and places them as build options.
- *
- * Prepares these build options:
- * -INPUT_BETA_MULTIPLIER, INPUT_BETA_LEFT_SHIFT - quantized representation of beta multiplier.
- * -DIFF_MIN - threshold difference between maximum value of input data and current processed value,
- * it defines whether the value will be taken into account or not.
- *
- * @param[in] input_scale Input scaling factor
- * @param[in] beta        Exponent scaling factor beta
- *
- * @return the build options extended with the quantized softmax parameters
- */
-CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float beta)
-{
- // Number of integer bits in temporary fixed-point representation of current-to-max difference
- static const int scaled_diff_int_bits = 5;
- // Number of integer bits used in temporary fixed-point representation of exponent accumulator
- static const int exp_accumulation_in_bits = 12;
-
- const double beta_multiplier = std::min(
- 1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)),
- (1LL << 31) - 1.0);
- int input_beta_multiplier;
- int input_beta_left_shift;
- quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier, &input_beta_left_shift);
-
- const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1LL << (31 - scaled_diff_int_bits)) / (1LL << input_beta_left_shift);
- const int diff_min = -1.f * std::floor(max_input_rescaled);
-
- CLBuildOptions build_opts;
- build_opts.add_option("-DSCALED_DIFF_INT_BITS=" + support::cpp11::to_string(scaled_diff_int_bits));
- build_opts.add_option("-DEXP_ACCUMULATION_INT_BITS=" + support::cpp11::to_string(exp_accumulation_in_bits));
- build_opts.add_option("-DINPUT_BETA_MULTIPLIER=" + support::cpp11::to_string(input_beta_multiplier));
- build_opts.add_option("-DINPUT_BETA_LEFT_SHIFT=" + support::cpp11::to_string(input_beta_left_shift));
- build_opts.add_option("-DDIFF_MIN=" + support::cpp11::to_string(diff_min));
-
- return build_opts;
-}
-
-Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(max, sum, output);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, max);
-
- const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->data_type());
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- if(is_quantized_asymmetric)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
-
- // Checks performed when sum is configured
- if(sum->total_size() != 0)
- {
- if(is_quantized_asymmetric)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(sum, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(max, sum);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(max, sum);
- }
-
- return Status{};
-}
-
-Status validate_arguments_1DNorm(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, const SoftmaxKernelInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(sum, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
- ARM_COMPUTE_RETURN_ERROR_ON(info.is_log && !is_data_type_float(info.input_data_type));
-
- // Note: output should always have a scale of 1/256 and offset 0
- const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log);
- const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type);
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- if(!is_quantized_asymmetric)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON(output->quantization_info() != allowed_quantization_info);
- }
- }
-
- return Status{};
-}
-} // namespace
-
-/**< Grid size (obtained through auto-tuning) */
-const unsigned int CLLogits1DMaxShiftExpSumKernel::_grid_size = 64;
-/**< Vector size in the serial case (obtained through auto-tuning) */
-const unsigned int CLLogits1DMaxShiftExpSumKernel::_serial_vector_size = 8;
-/**< Vector size in the parallel case (obtained through auto-tuning, enables the best memory access pattern for Bifrost). */
-const unsigned int CLLogits1DMaxShiftExpSumKernel::_parallel_vector_size = 4;
-
-CLLogits1DMaxShiftExpSumKernel::CLLogits1DMaxShiftExpSumKernel()
- : _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
-{
-}
-
-void CLLogits1DMaxShiftExpSumKernel::configure(const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, const SoftmaxKernelInfo &info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, max, output, sum, info);
-}
-
-void CLLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, const SoftmaxKernelInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, sum, output);
-
- auto padding_info = get_padding_info({ input, max, output, sum });
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*sum->info(), input->info()->clone()->set_tensor_shape(max->info()->tensor_shape()));
- auto_init_if_empty(*output->info(), *input->info()->clone());
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DMaxShiftExpSum(input->info(), max->info(), output->info(), sum->info()));
-
- _input = input;
- _max = max;
- _output = output;
- _sum = sum;
-
- const DataType dt = input->info()->data_type();
- const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
- const size_t reduction_dim_size = input->info()->dimension(0);
- const float beta = info.beta;
- const auto is_signed_qasymm8 = is_data_type_quantized_asymmetric_signed(info.input_data_type);
- const int min_value = is_signed_qasymm8 ? CL_SCHAR_MIN : 0;
-
- ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(reduction_dim_size);
- const unsigned int vector_size = adjust_vec_size(std::get<1>(parallel_reduction_info), reduction_dim_size);
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt));
- build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(min_value));
- build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
- build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(reduction_dim_size));
- build_opts.add_option("-DVECTOR_SIZE_LEFTOVER=" + support::cpp11::to_string(reduction_dim_size % vector_size));
- build_opts.add_option("-DLOG_VECTOR_SIZE=" + support::cpp11::to_string(lround(log2(vector_size))));
- build_opts.add_option_if((reduction_dim_size % vector_size) != 0, "-DNON_MULTIPLE_OF_VECTOR_SIZE");
- build_opts.add_option_if(is_signed_qasymm8, "-DQASYMM8_SIGNED");
- build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + float_to_string_with_full_precision(beta));
- build_opts.add_option_if(is_data_type_float(dt) && info.is_log, "-DLOG_SOFTMAX");
- build_opts.add_option_if(is_data_type_float(dt), "-DMINVAL=" + ((dt == DataType::F16) ? std::string("-HALF_MAX") : std::string("-FLT_MAX")));
- build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), prepare_quantized_softmax_build_options(qinfo.scale, beta).options());
-
- cl::NDRange lws_hint(cl::NullRange);
- std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_") + (is_data_type_quantized_asymmetric(dt) ? "quantized_" : "");
-
- // Configure parallel kernel if needed
- if(std::get<0>(parallel_reduction_info))
- {
- kernel_name += "parallel";
- bool is_grid_size_pow2 = (_grid_size != 0) && ((_grid_size & (_grid_size - 1)) == 0);
- build_opts.add_option_if(is_grid_size_pow2 && _grid_size <= 256, "-DGRID_SIZE=" + support::cpp11::to_string(_grid_size));
-
- // Handle boundary conditions.
- const unsigned int multiple_grid_size = (reduction_dim_size / vector_size) % _grid_size;
- build_opts.add_option_if((multiple_grid_size != 0) || ((reduction_dim_size % vector_size) != 0), "-DNON_MULTIPLE_OF_GRID_SIZE");
- // Setting _lws_hint in this way can also communicate grid_size to CLLogits1DMaxShiftExpSumKernel::run().
- // A single workgroup performs reduction in dimension 0 in the parallel case, hence lws[0]==gws[0].
- lws_hint = cl::NDRange(_grid_size);
- }
- else
- {
- kernel_name += "serial";
- }
-
- // Create kernel.
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure window
- Window win = calculate_max_window(*(input->info()), Steps(reduction_dim_size));
- ICLKernel::configure_internal(win, lws_hint);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLLogits1DMaxShiftExpSumKernel::validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DMaxShiftExpSum(input, max, output, sum));
- return Status{};
-}
-
-CLLogits1DMaxShiftExpSumKernel::ParallelReductionInfo CLLogits1DMaxShiftExpSumKernel::is_parallel_reduction(size_t size)
-{
- bool is_parallel_reduction = (size >= (_grid_size * _serial_vector_size)) && (_grid_size > 1);
- unsigned int vector_size = is_parallel_reduction ? _parallel_vector_size : _serial_vector_size;
- return std::make_tuple(is_parallel_reduction, vector_size);
-}
-
-void CLLogits1DMaxShiftExpSumKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- // Collapse window in Z dimension
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-
- // Reconfigure window in case of parallel reduction
- ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(_input->info()->dimension(0));
- if(std::get<0>(parallel_reduction_info))
- {
- // Launch grid_size parallel work items
- window_collapsed.set(Window::DimX, Window::Dimension(0, _grid_size, 1));
- }
-
- // Get slices
- Window slice = window_collapsed.first_slice_window_3D();
- do
- {
- unsigned int idx = 0;
- // Set inputs
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _max, slice);
- add_3D_tensor_argument(idx, _output, slice);
- add_3D_tensor_argument(idx, _sum, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
-}
-
-CLLogits1DNormKernel::CLLogits1DNormKernel()
- : _input(nullptr), _sum(nullptr), _output(nullptr)
-{
-}
-
-void CLLogits1DNormKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, const SoftmaxKernelInfo &info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, sum, output, info);
-}
-
-void CLLogits1DNormKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, const SoftmaxKernelInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
-
- auto padding_info = get_padding_info({ input, output, sum });
-
- // Note: output should always have a scale of 1/256 and offset 0
- const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type);
- const DataType output_data_type = info.input_data_type;
- const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log);
- const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(),
- input->info()->clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info));
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DNorm(input->info(), sum->info(), output->info(), info));
-
- _input = input;
- _sum = sum;
- _output = output;
-
- const auto is_signed_qasymm8 = is_data_type_quantized_asymmetric_signed(info.input_data_type);
- const int min_value = is_signed_qasymm8 ? CL_SCHAR_MIN : 0;
- const unsigned int vector_size = adjust_vec_size(16, input->info()->dimension(0));
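- // Example (assuming adjust_vec_size(16, n) halves the requested width until it fits in n):
- //   dimension(0) == 20 -> vector_size == 16, VECTOR_SIZE_LEFTOVER == 20 % 16 == 4;
- //   dimension(0) == 10 -> vector_size == 8,  VECTOR_SIZE_LEFTOVER == 10 % 8  == 2.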
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(info.input_data_type));
- build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(min_value));
- build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
- build_opts.add_option("-DVECTOR_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % vector_size));
- build_opts.add_option_if(is_data_type_quantized_asymmetric_signed(info.input_data_type), "-DQASYMM8_SIGNED");
- build_opts.add_options_if(is_quantized_asymmetric,
- prepare_quantized_softmax_build_options(qinfo.scale, info.beta).options());
- build_opts.add_option_if(info.is_log, "-DLOG_SOFTMAX");
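- // Sketch of what the norm step computes per element (illustrative; the input tensor here
- // already holds the shifted exponentials, or the shifted logits in the log case):
- //   softmax:     out[i] = in[i] / sum
- //   log softmax: out[i] = in[i] - log(sum)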
-
- // Create kernel
- std::string kernel_name = std::string("softmax_layer_norm") + (is_quantized_asymmetric ? "_quantized" : "");
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure window
- auto win = calculate_max_window(*(input->info()), Steps(vector_size));
- ICLKernel::configure_internal(win);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLLogits1DNormKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, const SoftmaxKernelInfo &info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DNorm(input, sum, output, info));
-
- return Status{};
-}
-
-void CLLogits1DNormKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = window_collapsed.first_slice_window_3D();
-
- do
- {
- Window sum_slice = slice;
- sum_slice.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- unsigned int idx = 0;
- // Set inputs
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _sum, sum_slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.h b/src/core/CL/kernels/CLSoftmaxLayerKernel.h
deleted file mode 100644
index 29e0f63e46..0000000000
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.h
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H
-#define ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/CL/ICLSimple3DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for max, shifting, exponentiating and summing the logits */
-class CLLogits1DMaxShiftExpSumKernel : public ICLKernel
-{
-public:
- /** Info for whether a parallel reduction will be run and the vector size of the execution. */
- using ParallelReductionInfo = std::tuple<bool, unsigned int>;
-
-public:
- /** Default constructor */
- CLLogits1DMaxShiftExpSumKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLLogits1DMaxShiftExpSumKernel(const CLLogits1DMaxShiftExpSumKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLLogits1DMaxShiftExpSumKernel &operator=(const CLLogits1DMaxShiftExpSumKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLLogits1DMaxShiftExpSumKernel(CLLogits1DMaxShiftExpSumKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLLogits1DMaxShiftExpSumKernel &operator=(CLLogits1DMaxShiftExpSumKernel &&) = default;
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in,out] max Max values tensor. Data types supported: same as @p input
- * @param[out] output Destination tensor. Data types supported: same as @p input
- * @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p input
- * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
- */
- void configure(const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, const SoftmaxKernelInfo &info);
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in,out] max Max values tensor. Data types supported: same as @p input
- * @param[out] output Destination tensor. Data types supported: same as @p input
- * @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p input
- * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, const SoftmaxKernelInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DMaxShiftExpSumKernel
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in] max Max values tensor. Data types supported: same as @p input
- * @param[in] output Destination tensor. Data types supported: same as @p input
- * @param[in] sum Sum of 1D logits tensor. Data types supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum);
- /** Checks if the given size is eligible for parallel reduction
- *
- * @note Serial reduction is launched for width < (_grid_size * _serial_vector_size).
- * @note Parallel reduction is launched for width >= (_grid_size * _serial_vector_size) and vector_size is forced to 4.
- *
- * @param[in] size Size to check
- *
- * @return A two-element tuple where the first element is a boolean specifying if a parallel reduction will be run,
- * while the second element is the vector size of the execution.
- */
- static ParallelReductionInfo is_parallel_reduction(size_t size);
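- // Illustrative only - the concrete thresholds are assumptions (e.g. _grid_size == 64,
- // _serial_vector_size == 8):
- //   is_parallel_reduction(1024) -> (true, _parallel_vector_size), as 1024 >= 64 * 8;
- //   is_parallel_reduction(256)  -> (false, 8), so a serial reduction is launched.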
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_max;
- ICLTensor *_output;
- ICLTensor *_sum;
-
-private:
- static const unsigned int _grid_size;
- static const unsigned int _serial_vector_size;
- static const unsigned int _parallel_vector_size;
-};
-/** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */
-class CLLogits1DNormKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLLogits1DNormKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLLogits1DNormKernel(const CLLogits1DNormKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLLogits1DNormKernel &operator=(const CLLogits1DNormKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLLogits1DNormKernel(CLLogits1DNormKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLLogits1DNormKernel &operator=(CLLogits1DNormKernel &&) = default;
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: S32/F16/F32. If this kernel is used for log softmax, only F32/F16 are supported.
- * @param[in] sum Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
- * @param[out] output Destination tensor. Data types supported: QASYMM8/QASYMM8_SIGNED for S32 @p input, or same as @p input
- * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
- */
- void configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, const SoftmaxKernelInfo &info);
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: S32/F16/F32. If this kernel is used for log softmax, only F32/F16 are supported.
- * @param[in] sum Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
- * @param[out] output Destination tensor. Data types supported: QASYMM8/QASYMM8_SIGNED for S32 @p input, or same as @p input
- * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, const SoftmaxKernelInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DNormKernel
- *
- * @param[in] input Source tensor. Data types supported: S32/F16/F32. If this kernel is used for log softmax, only F32/F16 are supported.
- * @param[in] sum Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
- * @param[in] output Destination tensor. Data types supported: QASYMM8/QASYMM8_SIGNED for S32 @p input, or same as @p input
- * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, const SoftmaxKernelInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- const ICLTensor *_sum;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
index 57f7af488b..f4c0839ad2 100644
--- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -37,19 +39,22 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *block_info,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{ 2 });
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{2});
ARM_COMPUTE_RETURN_ERROR_ON(paddings->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{ 2, 2 });
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{2, 2});
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const DataLayout data_layout = input->data_layout();
const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
@@ -60,7 +65,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
return Status{};
}
-Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status validate_arguments_static(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -69,9 +78,10 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(input, block_shape_x, block_shape_y, padding_left, padding_right);
+ TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+ input, block_shape_x, block_shape_y, padding_left, padding_right);
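+ // Shape sketch (illustrative): a [W=4, H=4, C=3, N=1] input with block 2x2 and zero padding
+ // yields [W=2, H=2, C=3, N=4] - spatial dims divide by the block, batch multiplies by 2 * 2.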
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -84,18 +94,27 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
CLSpaceToBatchLayerKernel::CLSpaceToBatchLayerKernel()
: _input(nullptr), _block_shape(nullptr), _paddings(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input,
+ const ICLTensor *block_shape,
+ const ICLTensor *paddings,
+ ICLTensor *output)
{
configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output);
}
-void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ const ICLTensor *paddings,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
- auto padding_info = get_padding_info({ input, block_shape, paddings, output });
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
+ auto padding_info = get_padding_info({input, block_shape, paddings, output});
_input = input;
_block_shape = block_shape;
@@ -109,14 +128,17 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex
// Create kernel
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch)));
build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
build_opts.add_option("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
build_opts.add_option("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_batch)));
- _kernel = create_kernel(compile_context, "space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel = create_kernel(compile_context,
+ "space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
@@ -124,22 +146,34 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
- ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+ configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left,
+ padding_right, output);
}
-void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left,
- const Size2D &padding_right,
- ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+ TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+ input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, padding_right, output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left,
+ padding_right, output->info()));
_input = input;
_output = output;
@@ -151,7 +185,8 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex
// Create kernel
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch)));
@@ -164,22 +199,32 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex
build_opts.add_option("-DPAD_RIGHT_X=" + support::cpp11::to_string(padding_right.x()));
build_opts.add_option("-DPAD_LEFT_Y=" + support::cpp11::to_string(padding_left.y()));
build_opts.add_option("-DPAD_RIGHT_Y=" + support::cpp11::to_string(padding_right.y()));
- _kernel = create_kernel(compile_context, "space_to_batch_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel = create_kernel(
+ compile_context, "space_to_batch_static_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
ICLKernel::configure_internal(win);
}
-Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, paddings, output));
return Status{};
}
-Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
return Status{};
}
@@ -216,7 +261,6 @@ void CLSpaceToBatchLayerKernel::run(const Window &window, cl::CommandQueue &queu
add_3D_tensor_argument(idx, _output, slice_out);
enqueue(queue, *this, slice_out, lws_hint());
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h
index 4817cfeef2..f9dce9db47 100644
--- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h
+++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLSPACETOBATCHLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -63,7 +64,11 @@ public:
* @param[in] paddings 2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ const ICLTensor *paddings,
+ ICLTensor *output);
/** Initialise the kernel's input and output. (Static block shape and paddings)
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -73,7 +78,12 @@ public:
* @param[in] padding_right The padding at the end of every dimension of the output tensor.
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output);
+ void configure(const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output);
/** Initialise the kernel's input and output. (Static block shape and paddings)
*
* @param[in] compile_context The compile context to be used.
@@ -84,8 +94,13 @@ public:
* @param[in] padding_right The padding at the end of every dimension of the output tensor.
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
- ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -95,7 +110,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel (Static block shape and paddings)
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -107,7 +125,12 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ const ITensorInfo *output);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
index 4e5b417ec6..25662b5c62 100644
--- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -44,7 +46,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const DataLayout data_layout = input->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -63,9 +65,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
}
} // namespace
-CLSpaceToDepthLayerKernel::CLSpaceToDepthLayerKernel()
- : _input(nullptr), _output(nullptr), _block_shape()
+CLSpaceToDepthLayerKernel::CLSpaceToDepthLayerKernel() : _input(nullptr), _output(nullptr), _block_shape()
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLSpaceToDepthLayerKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape)
@@ -73,10 +75,13 @@ void CLSpaceToDepthLayerKernel::configure(const ICLTensor *input, ICLTensor *out
configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
}
-void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ int32_t block_shape)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
TensorShape output_shape = compute_space_to_depth_shape(input->info(), block_shape);
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
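// Illustrative: with block_shape == 2, a [W=4, H=4, C=3, N=1] input becomes [2, 2, 12, 1],
// since each 2x2 spatial patch is folded into the channels (C * block_shape * block_shape).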
@@ -92,11 +97,14 @@ void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_contex
// Create kernel
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(output->info()->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(output->info()->data_type())));
build_opts.add_option("-DCHANNEL_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_channel)));
build_opts.add_option("-DBLOCK_SHAPE=" + support::cpp11::to_string(block_shape));
build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
- _kernel = create_kernel(compile_context, "space_to_depth_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel = create_kernel(compile_context,
+ "space_to_depth_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
@@ -134,7 +142,6 @@ void CLSpaceToDepthLayerKernel::run(const Window &window, cl::CommandQueue &queu
enqueue(queue, *this, slice_out, lws_hint());
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h
index bb1ac5f9a6..d0932919e0 100644
--- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h
+++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLSPACETODEPTHLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -61,7 +62,8 @@ public:
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] block_shape Block shape value.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+ void
+ configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
/** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToDepthLayerKernel.
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
diff --git a/src/core/CL/kernels/CLStackLayerKernel.cpp b/src/core/CL/kernels/CLStackLayerKernel.cpp
index 9bdcc8dc3f..23e26716e7 100644
--- a/src/core/CL/kernels/CLStackLayerKernel.cpp
+++ b/src/core/CL/kernels/CLStackLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,10 +30,10 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
using namespace arm_compute::misc::shape_calculator;
@@ -42,7 +42,11 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
@@ -51,9 +55,10 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned
ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_stack_shape(*input, axis, num_tensors));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+ compute_stack_shape(*input, axis, num_tensors));
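+ // Illustrative: stacking num_tensors == 4 inputs of shape (2, 3) at axis 0 yields (4, 2, 3),
+ // at axis 1 yields (2, 4, 3): a new dimension of size num_tensors is inserted at 'axis'.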
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
@@ -61,7 +66,8 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
{
// Output auto initialization if not yet initialized
auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors)));
@@ -73,17 +79,23 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsi
}
} // namespace
-CLStackLayerKernel::CLStackLayerKernel()
- : _input(nullptr), _output(nullptr)
+CLStackLayerKernel::CLStackLayerKernel() : _input(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLStackLayerKernel::configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output)
+void CLStackLayerKernel::configure(
+ const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output)
{
configure(CLKernelLibrary::get().get_compile_context(), input, axis, idx_input, num_tensors, output);
}
-void CLStackLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output)
+void CLStackLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info()));
@@ -111,10 +123,15 @@ void CLStackLayerKernel::configure(const CLCompileContext &compile_context, cons
_kernel.setArg<cl_uint>(idx, idx_input);
}
-Status CLStackLayerKernel::validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+Status CLStackLayerKernel::validate(const ITensorInfo *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
return Status{};
}
diff --git a/src/core/CL/kernels/CLStackLayerKernel.h b/src/core/CL/kernels/CLStackLayerKernel.h
index 2865127a90..d3c17f529c 100644
--- a/src/core/CL/kernels/CLStackLayerKernel.h
+++ b/src/core/CL/kernels/CLStackLayerKernel.h
@@ -26,6 +26,7 @@
#define ARM_COMPUTE_CLSTACKLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -60,7 +61,8 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input.
*
*/
- void configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
+ void configure(
+ const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
/** Initialise the kernel's inputs and output
*
* @note Supported input tensor rank: up to 4
@@ -74,7 +76,12 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input.
*
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLStackLayerKernel
*
* @note Supported input tensor rank: up to 4
@@ -88,7 +95,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ const ITensorInfo *output);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp
index 75fd01df14..20cd835069 100644
--- a/src/core/CL/kernels/CLStridedSliceKernel.cpp
+++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,10 +22,13 @@
* SOFTWARE.
*/
#include "src/core/CL/kernels/CLStridedSliceKernel.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/helpers/tensor_transform.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/core/utils/helpers/bit_ops.h"
@@ -36,9 +39,14 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
@@ -47,19 +55,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions());
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i)
- {
- return i == 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) { return i == 0; }));
// Get expected output shape
- const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
+ const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(
+ *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
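+ // Illustrative: a 1-D input of 10 elements with starts=(1), ends=(8), strides=(2) and no
+ // masks yields ceildiv(8 - 1, 2) == 4 output elements (indices 1, 3, 5, 7).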
ARM_COMPUTE_RETURN_ERROR_ON(exp_output_shape.total_size() == 0);
// Checks output if configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const TensorInfo exp_output_info = output->clone()->set_tensor_shape(exp_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &exp_output_info);
@@ -68,46 +73,42 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
return Status{};
}
+} // namespace
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+CLStridedSliceKernel::CLStridedSliceKernel()
{
- // Output tensor auto initialization if not yet initialized
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
-
- // Create window
- Window win = calculate_max_window(*output, Steps());
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
-
- return std::make_pair(Status{}, win);
+ _type = CLKernelType::ELEMENTWISE;
}
-} // namespace
-void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void CLStridedSliceKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- auto padding_info = get_padding_info({ input, output });
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+ auto padding_info = get_padding_info({input, output});
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
const TensorShape &input_shape = input->tensor_shape();
Coordinates starts_abs;
Coordinates ends_abs;
Coordinates final_strides;
- std::tie(starts_abs, ends_abs, final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(
- input_shape,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
+ std::tie(starts_abs, ends_abs, final_strides) =
+ arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(input_shape, starts, ends, strides,
+ begin_mask, end_mask, shrink_axis_mask);
// Configure kernel window
- auto win_config = validate_and_configure_window(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(
+ *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
+ Window win = calculate_max_window(*output, Steps());
// Enable multiple elements processing along x if stride_x is 1 and output width greater than the access vector size
const int vec_size_x = 16 / input->element_size();
@@ -116,29 +117,31 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co
const bool multi_access_x = !is_shrink_on_x && (final_strides.x() == 1) && (output_width_x / vec_size_x > 0);
// Update window if needed
- if(multi_access_x)
+ if (multi_access_x)
{
- Window &updated_window = std::get<1>(win_config);
+ Window &updated_window = win;
updated_window.set(Window::DimX,
- Window::Dimension(updated_window.x().start(), ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x));
+ Window::Dimension(updated_window.x().start(),
+ ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x));
}
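// Illustrative: for F32, vec_size_x == 16 / 4 == 4, so an output row of 10 elements gets its
// window end rounded up from 10 to ceil_to_multiple(10, 4) == 12 for vectorised access.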
- ICLKernel::configure_internal(win_config.second);
+ ICLKernel::configure_internal(win);
// Create build options
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->data_type())));
- for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->data_type())));
+ for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
{
const bool is_shrink = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, i);
- build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(starts_abs[i]));
- build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(final_strides[i]));
+ build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" +
+ support::cpp11::to_string(starts_abs[i]));
+ build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" +
+ support::cpp11::to_string(final_strides[i]));
build_opts.add_option_if(is_shrink, "-DSHRINK_" + support::cpp11::to_string(i));
}
- build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+ build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(
+ std::max<int>(output_width_x - vec_size_x, 0)));
build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
- build_opts.add_option_if_else(input_shape.num_dimensions() > 2,
- "-DSRC_DEPTH=" + support::cpp11::to_string(input_shape.z()),
- "-DSRC_DEPTH=1");
build_opts.add_option_if_else(output->num_dimensions() > 2,
"-DDST_DEPTH=" + support::cpp11::to_string(output->tensor_shape().z()),
"-DDST_DEPTH=1");
@@ -150,7 +153,7 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co
_config_id = "strided_slice";
_config_id += "_";
_config_id += lower_string(string_from_data_type(input->data_type()));
- for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
{
_config_id += "_";
_config_id += support::cpp11::to_string(input->dimension(i));
@@ -164,14 +167,17 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status CLStridedSliceKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
- starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
return Status{};
}
@@ -181,8 +187,9 @@ void CLStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, cl
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
Window slice = window_collapsed.first_slice_window_4D();
@@ -193,7 +200,6 @@ void CLStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, cl
add_4D_tensor_argument(idx, src, slice);
add_4D_tensor_argument(idx, dst, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_4D(slice));
+ } while (window_collapsed.slide_window_slice_4D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.h b/src/core/CL/kernels/CLStridedSliceKernel.h
index 599cf34c39..1cf5bcacec 100644
--- a/src/core/CL/kernels/CLStridedSliceKernel.h
+++ b/src/core/CL/kernels/CLStridedSliceKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
#include <cstdint>
@@ -35,6 +36,9 @@ namespace arm_compute
class CLStridedSliceKernel : public ICLKernel
{
public:
+ /** Default constructor */
+ CLStridedSliceKernel();
+
/** Configure kernel
*
* @note Supported tensor rank: up to 4
@@ -50,9 +54,15 @@ public:
* @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask);
/** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel
*
@@ -68,9 +78,14 @@ public:
* @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLTileKernel.cpp b/src/core/CL/kernels/CLTileKernel.cpp
index c0c3d2e2ee..fa996c4008 100644
--- a/src/core/CL/kernels/CLTileKernel.cpp
+++ b/src/core/CL/kernels/CLTileKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,8 +22,11 @@
* SOFTWARE.
*/
#include "src/core/CL/kernels/CLTileKernel.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -38,15 +41,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty());
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e)
- {
- return e == 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) { return e == 0; }));
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -54,9 +55,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-CLTileKernel::CLTileKernel()
- : _input(nullptr), _output(nullptr)
+CLTileKernel::CLTileKernel() : _input(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLTileKernel::configure(const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
@@ -64,7 +65,10 @@ void CLTileKernel::configure(const ICLTensor *input, ICLTensor *output, const Mu
configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples);
}
-void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
+void CLTileKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Multiples &multiples)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -78,11 +82,13 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT
_input = input;
_output = output;
- const DataType data_type = input->info()->data_type();
- const int vec_size_x = 16 / input->info()->element_size();
- const int input_width_x = input->info()->tensor_shape().x();
- const unsigned int offset = ceil_to_multiple(input_width_x, vec_size_x) - input_width_x;
- const bool multi_access_x = (input_width_x / vec_size_x > 0);
+ const DataType data_type = input->info()->data_type();
+ const int vec_size_x = 16 / input->info()->element_size();
+ const int input_width_x = input->info()->tensor_shape().x();
+ const unsigned int input_width_ceil = ceil_to_multiple(input_width_x, vec_size_x);
+ const unsigned int input_width_tiles = input_width_ceil / vec_size_x;
+ const unsigned int offset = input_width_ceil - input_width_x;
+ const bool multi_access_x = (input_width_x / vec_size_x > 0);
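+ // Worked example (illustrative): an F32 input of width 10 gives vec_size_x == 16 / 4 == 4,
+ // input_width_ceil == 12, input_width_tiles == 3 and offset == 2; the offset lets the last
+ // vector access be shifted back so it stays inside the row.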
// Create kernel
CLBuildOptions build_opts;
@@ -94,20 +100,20 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT
build_opts.add_option("-DDST_DEPTH=" + support::cpp11::to_string(output->info()->dimension(2)));
build_opts.add_option_if(multi_access_x, "-DOFFSET=" + support::cpp11::to_string(offset));
build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option_if(multi_access_x, "-DSRC_WIDTH_TILES=" + support::cpp11::to_string(input_width_tiles));
_kernel = create_kernel(compile_context, "tile", build_opts.options());
// Configure window without padding
Window win = calculate_max_window(*output->info());
- if(multi_access_x)
+ if (multi_access_x)
{
// If multi-access is enabled, no thread should cross the tile boundaries. This means we need
// as many threads as are required to cover a single tile, times multiples[0]. Note that if
// threads do not cross the tile boundaries, they also cannot cross the boundary of the last
// tile, so we don't need to pad the output.
const unsigned int size_win_x = ceil_to_multiple(input->info()->dimension(0), vec_size_x) * multiples[0];
- win.set(Window::DimX,
- Window::Dimension(win.x().start(), size_win_x, vec_size_x));
+ win.set(Window::DimX, Window::Dimension(win.x().start(), size_win_x, vec_size_x));
}
ICLKernel::configure_internal(win);
@@ -116,7 +122,7 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT
_config_id = "tile";
_config_id += "_";
_config_id += lower_string(string_from_data_type(input->info()->data_type()));
- for(unsigned int i = 0; i < multiples.size(); ++i)
+ for (unsigned int i = 0; i < multiples.size(); ++i)
{
_config_id += "_";
_config_id += support::cpp11::to_string(input->info()->dimension(i));
@@ -145,7 +151,6 @@ void CLTileKernel::run(const Window &window, cl::CommandQueue &queue)
add_4D_tensor_argument(idx, _input, slice);
add_4D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_4D(slice));
+ } while (collapsed.slide_window_slice_4D(slice));
}
} // namespace arm_compute
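The window sizing in the hunks above reduces to a few integer computations. A minimal sketch, assuming ceil_to_multiple(v, m) rounds v up to the nearest multiple of m and that the OpenCL vector width is 16 bytes:

    #include <cstddef>

    size_t ceil_to_multiple_sketch(size_t v, size_t m)
    {
        return ((v + m - 1) / m) * m;
    }

    // X-window extent when multi-access is enabled: threads step by
    // vec_size_x and must not cross a tile boundary, so the window spans
    // the rounded-up source width repeated multiples[0] times.
    size_t tile_window_size_x(size_t element_size, size_t input_width_x, size_t multiple_x)
    {
        const size_t vec_size_x = 16 / element_size; // e.g. 4 elements for F32
        const size_t width_ceil = ceil_to_multiple_sketch(input_width_x, vec_size_x);
        // width_ceil - input_width_x is the -DOFFSET overhang of the last
        // vector access in a row; width_ceil / vec_size_x is -DSRC_WIDTH_TILES.
        return width_ceil * multiple_x;
    }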
diff --git a/src/core/CL/kernels/CLTileKernel.h b/src/core/CL/kernels/CLTileKernel.h
index 41752ca90b..c3486aecef 100644
--- a/src/core/CL/kernels/CLTileKernel.h
+++ b/src/core/CL/kernels/CLTileKernel.h
@@ -64,7 +64,10 @@ public:
* @param[out] output Destination tensor. Same as @p input
*
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Multiples &multiples);
/** Static function to check if given info will lead to a valid configuration of @ref CLTileKernel
*
* @param[in] input Source tensor info. Data type supported: All.
diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp
deleted file mode 100644
index 56ff48be1f..0000000000
--- a/src/core/CL/kernels/CLTransposeKernel.cpp
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLTransposeKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-TensorShape transposed_tensor_shape(const TensorShape &in)
-{
- TensorShape output_shape{ in };
- const size_t w_out = in[1];
- const size_t h_out = in[0];
- output_shape.set(0, w_out);
- output_shape.set(1, h_out);
-
- return output_shape;
-}
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
-
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info = input->clone()->set_tensor_shape(transposed_tensor_shape(input->tensor_shape()));
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- }
-
- return Status{};
-}
-} // namespace
-
-Status CLTransposeKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
- return Status{};
-}
-
-void CLTransposeKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLTransposeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(transposed_tensor_shape(input->info()->tensor_shape())));
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
- auto padding_info = get_padding_info({ input, output });
-
- _input = input;
- _output = output;
-
- const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
- const int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
- const unsigned int vec_size_y = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(1));
- const int vec_size_y_leftovers = input->info()->dimension(1) % vec_size_y;
-
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE_IN_BYTES=" + support::cpp11::to_string(input->info()->element_size()));
- build_opts.add_option("-DVEC_SIZE_X=" + support::cpp11::to_string(vec_size_x));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER_X=" + support::cpp11::to_string(vec_size_x_leftovers));
- build_opts.add_option("-DVEC_SIZE_Y=" + support::cpp11::to_string(vec_size_y));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER_Y=" + support::cpp11::to_string(vec_size_y_leftovers));
-
- _kernel = create_kernel(compile_context, "transpose", build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(vec_size_x, vec_size_y));
- ICLKernel::configure_internal(win, cl::NDRange(2, 8));
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-} // namespace arm_compute
\ No newline at end of file
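Before its removal, this kernel derived per-axis vector sizes and leftovers as shown above. A worked example of that arithmetic, assuming max_cl_vector_width is 16 bytes and that adjust_vec_size merely clamps to the dimension:

    #include <algorithm>
    #include <cstddef>

    // Illustrative stand-in for adjust_vec_size: clamp the preferred vector
    // length (in elements) to the size of the dimension being vectorised.
    size_t adjust_vec_size_sketch(size_t preferred, size_t dim)
    {
        return std::max<size_t>(1, std::min(preferred, dim));
    }

    // For an F32 (4-byte) 10x6 matrix with a 16-byte vector width:
    //   vec_size_x = min(16 / 4, 10) = 4, leftover_x = 10 % 4 = 2
    //   vec_size_y = min(16 / 4, 6)  = 4, leftover_y = 6 % 4  = 2
    // The leftovers are what the kernel received as -DVEC_SIZE_LEFTOVER_X/Y.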
diff --git a/src/core/CL/kernels/CLTransposeKernel.h b/src/core/CL/kernels/CLTransposeKernel.h
deleted file mode 100644
index 0c4b7b4aff..0000000000
--- a/src/core/CL/kernels/CLTransposeKernel.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLTRANSPOSEKERNEL_H
-#define ARM_COMPUTE_CLTRANSPOSEKERNEL_H
-
-#include "src/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel which transposes the elements of a matrix.
- *
- * [width, height, batch] -> [height, width, batch]
- *
- */
-class CLTransposeKernel : public ICLSimple2DKernel
-{
-public:
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data types supported: All.
- * @param[out] output Output tensor. Data type supported: Same as @p input
- */
- void configure(const ICLTensor *input, ICLTensor *output);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: All.
- * @param[out] output Output tensor. Data type supported: Same as @p input
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLTransposeKernel
- *
- * @param[in] input Input tensor. Data types supported: All.
- * @param[in] output Output tensor. Data type supported: Same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLTRANSPOSEKERNEL_H */
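Every kernel deleted in this patch shared the two-step idiom visible in this header: a static validate() over tensor metadata, followed by configure() to build the OpenCL kernel and window. A hedged usage sketch against the now-removed API (tensor setup elided):

    #include "arm_compute/core/Error.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/core/CL/kernels/CLTransposeKernel.h" // removed by this patch

    using namespace arm_compute;

    void transpose_usage_sketch(CLTensor &input, CLTensor &output)
    {
        // Validate the tensor metadata first; throws on an invalid configuration.
        ARM_COMPUTE_ERROR_THROW_ON(CLTransposeKernel::validate(input.info(), output.info()));
        // Then configure, which selects the OpenCL program and execution window.
        CLTransposeKernel kernel;
        kernel.configure(&input, &output);
    }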
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
deleted file mode 100644
index 559f47ce26..0000000000
--- a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLWeightsReshapeKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON(num_groups == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::NHWC && num_groups > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4 && num_groups > 1);
- ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(3) % num_groups) != 0);
-
- if(biases != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(!is_data_type_float(input->data_type()));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1));
- ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->num_dimensions() != 2));
- ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->dimension(0) != input->tensor_shape()[3]));
- ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4]));
- }
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_weights_reshaped_shape(*input, biases != nullptr, num_groups));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- }
-
- return Status{};
-}
-} // namespace
-
-CLWeightsReshapeKernel::CLWeightsReshapeKernel()
- : _input(nullptr), _biases(nullptr), _output(nullptr)
-{
-}
-
-void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, biases, output, num_groups);
-}
-
-void CLWeightsReshapeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_weights_reshaped_shape(*input->info(), (biases != nullptr), num_groups)));
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
- (biases != nullptr) ? biases->info() : nullptr,
- output->info(), num_groups));
-
- auto padding_info = get_padding_info({ input, biases, output });
-
- const DataType data_type = input->info()->data_type();
-
- _biases = biases;
- _output = output;
- _input = input;
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(data_type)));
- build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
- build_opts.add_option_if(biases != nullptr, "-DHAS_BIAS");
-
- // Create kernel
- _kernel = create_kernel(compile_context, "reshape_to_columns", build_opts.options());
-
- // Configure window
- Window win = calculate_max_window(*input->info(), Steps());
- // The CLWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
- ICLKernel::configure_internal(win);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLWeightsReshapeKernel::validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, biases, output, num_groups));
- return Status{};
-}
-
-void CLWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- Window out_window;
- out_window.use_tensor_dimensions(_output->info()->tensor_shape());
-
- Window in_slice = window.first_slice_window_3D();
- Window out_slice = out_window.first_slice_window_2D();
-
- Window biases_window;
- Window biases_slice;
-
- unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
- idx += (_biases != nullptr) ? num_arguments_per_1D_tensor() : 0;
- _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(0));
- _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(1));
- _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(2));
- _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(3));
- _kernel.setArg<cl_uint>(idx++, _output->info()->strides_in_bytes().z());
-
- if(_biases != nullptr)
- {
- biases_window.use_tensor_dimensions(_biases->info()->tensor_shape());
- biases_slice = biases_window.first_slice_window_1D();
- }
-
- do
- {
- // Set arguments
- unsigned idx = 0;
- add_3D_tensor_argument(idx, _input, in_slice);
- add_2D_tensor_argument(idx, _output, out_slice);
- if(_biases != nullptr)
- {
- add_1D_tensor_argument(idx, _biases, biases_slice);
- ARM_COMPUTE_UNUSED(biases_window.slide_window_slice_1D(biases_slice));
- }
-
- // Run kernel
- enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice));
-}
-} // namespace arm_compute
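The reshape_to_columns kernel deleted above linearises each [kernel_x, kernel_y, IFM] volume into one matrix row per OFM, appending the bias as a final column when present. A minimal CPU-side sketch of the equivalent index math, assuming num_groups == 1 and an x-fastest [kernel_x, kernel_y, IFM, OFM] layout:

    #include <cstddef>
    #include <vector>

    // Illustrative only: produce a matrix with one row per output feature map.
    std::vector<float> reshape_weights_sketch(const std::vector<float> &w,
                                              size_t kx, size_t ky, size_t ifm, size_t ofm,
                                              const std::vector<float> *bias)
    {
        const size_t row_len = kx * ky * ifm + (bias != nullptr ? 1 : 0);
        std::vector<float> out(ofm * row_len);
        for (size_t o = 0; o < ofm; ++o)
        {
            size_t col = 0;
            for (size_t c = 0; c < ifm; ++c)        // channel-major, matching the
                for (size_t y = 0; y < ky; ++y)     // a000..a022, a100..a122 order
                    for (size_t x = 0; x < kx; ++x) // documented in the header below
                        out[o * row_len + col++] = w[((o * ifm + c) * ky + y) * kx + x];
            if (bias != nullptr)
                out[o * row_len + col] = (*bias)[o]; // bias appended as last column
        }
        return out;
    }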
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.h b/src/core/CL/kernels/CLWeightsReshapeKernel.h
deleted file mode 100644
index 402a60472b..0000000000
--- a/src/core/CL/kernels/CLWeightsReshapeKernel.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H
-#define ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-/** OpenCL kernel to perform reshaping on the weights used by convolution and locally connected layer
- *
- * Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels.
- * In combination with the @ref CLIm2ColKernel can transform a convolution to a matrix multiplication.
- *
- * For example assuming a 3D weight kernel of 3x3 dimensions and depth of 2 we have:
- * @f[
- * \left( \begin{array}{ccc}
- * a000 & a001 & a002 \\
- * a010 & a011 & a012 \\
- * a020 & a021 & a022 \\
- * \end{array} \right)
- * \left( \begin{array}{ccc}
- * a100 & a101 & a102 \\
- * a110 & a111 & a112 \\
- * a120 & a121 & a122 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccc}
- * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
- * \end{array} \right)
- * @f]
- */
-class CLWeightsReshapeKernel : public ICLKernel
-{
-public:
- /** Constructor.*/
- CLWeightsReshapeKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWeightsReshapeKernel(const CLWeightsReshapeKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWeightsReshapeKernel &operator=(const CLWeightsReshapeKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLWeightsReshapeKernel(CLWeightsReshapeKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLWeightsReshapeKernel &operator=(CLWeightsReshapeKernel &&) = default;
- /** Default destructor */
- ~CLWeightsReshapeKernel() = default;
- /** Set the input and output of the kernel.
- *
- * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
- * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: All
- * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
- * dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32, for quantized types this must be nullptr.
- * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
- * @param[out] output The output tensor. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise.
- * Data types supported: Same as @p input
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for the NCHW data layout,
- *                        in which case the number of weights (OFM) must be a multiple of num_groups.
- */
- void configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups = 1);
- /** Set the input and output of the kernel.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
- * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: All
- * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
- * dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32, for quantized types this must be nullptr.
- * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
- * @param[out] output The output tensor. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise.
- * Data types supported: Same as @p input
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for the NCHW data layout,
- *                        in which case the number of weights (OFM) must be a multiple of num_groups.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups = 1);
- /** Static function to check if given info will lead to a valid configuration of @ref CLWeightsReshapeKernel
- *
- * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
- * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: All
- * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
- * dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32, for quantized types this must be nullptr.
- * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
- * @param[in] output The output tensor. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise.
- * Data types supported: Same as @p input
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for the NCHW data layout,
- *                        in which case the number of weights (OFM) must be a multiple of num_groups.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups = 1);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- const ICLTensor *_biases;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H */
\ No newline at end of file
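Plugging numbers into the example documented in this header: a 3x3 kernel with depth (IFM) 2 linearises to a row of 3 * 3 * 2 = 18 elements, or 19 once a bias value is appended, so with 4 output feature maps and num_groups = 1 the reshaped output is a 4 x 19 matrix, one row per OFM.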
diff --git a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
deleted file mode 100644
index bd45ddb65f..0000000000
--- a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLWinogradFilterTransformKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "support/StringSupport.h"
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-
- const Size2D kernel_size = winograd_info.kernel_size;
- const Size2D output_tile_size = winograd_info.output_tile_size;
-
- const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd filter transform not supported");
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width || input->dimension(idx_h) != kernel_size.height);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input, winograd_info));
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_UNUSED(output);
-
- const unsigned int num_elems_processed_per_iteration_x = input->data_layout() == DataLayout::NCHW ? input->dimension(0) : 1;
- const unsigned int num_elems_processed_per_iteration_y = input->dimension(1);
- const unsigned int num_elems_read_per_iteration_z = input->data_layout() == DataLayout::NCHW ? 1 : input->dimension(2);
-
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y, num_elems_read_per_iteration_z));
- Window win_collapsed = win.collapse(win, Window::DimZ);
- return std::make_pair(Status{}, win_collapsed);
-}
-} // namespace
-
-CLWinogradFilterTransformKernel::CLWinogradFilterTransformKernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLWinogradFilterTransformKernel::configure(const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, winograd_info);
-}
-
-void CLWinogradFilterTransformKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input->info(), winograd_info)));
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), winograd_info));
- auto padding_info = get_padding_info({ input, output });
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL");
- build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_FILTER_TRANSFORM_VERTICAL");
- const Size2D kernel_size = winograd_info.kernel_size;
- const Size2D output_tile_size = winograd_info.output_tile_size;
-
- // Create kernel
- std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(input->info()->data_layout()));
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- _input = input;
- _output = output;
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLWinogradFilterTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, winograd_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
-
- return Status{};
-}
-
-void CLWinogradFilterTransformKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- // Setup output window
- Window window_out;
- window_out.use_tensor_dimensions(_output->info()->tensor_shape(), 0);
-
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, window);
- add_3D_tensor_argument(idx, _output, window_out);
- enqueue(queue, *this, window, lws_hint());
-}
-} // namespace arm_compute
\ No newline at end of file
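The deleted configure() above assembled the OpenCL program name from the output tile size, kernel size and data layout. A minimal sketch, assuming Size2D::to_string() yields the usual WxH form:

    #include <string>

    // Illustrative: ("4x4", "3x3", "nhwc") -> "winograd_filter_transform_4x4_3x3_nhwc"
    std::string winograd_filter_kernel_name(const std::string &output_tile,
                                            const std::string &kernel_size,
                                            const std::string &layout)
    {
        return "winograd_filter_transform_" + output_tile + "_" + kernel_size + "_" + layout;
    }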
diff --git a/src/core/CL/kernels/CLWinogradFilterTransformKernel.h b/src/core/CL/kernels/CLWinogradFilterTransformKernel.h
deleted file mode 100644
index d22fedebcd..0000000000
--- a/src/core/CL/kernels/CLWinogradFilterTransformKernel.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWINOGRADFILTERTRANSFORMKERNEL_H
-#define ARM_COMPUTE_CLWINOGRADFILTERTRANSFORMKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the Winograd filter transform kernel. */
-class CLWinogradFilterTransformKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLWinogradFilterTransformKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWinogradFilterTransformKernel(const CLWinogradFilterTransformKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWinogradFilterTransformKernel &operator=(const CLWinogradFilterTransformKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLWinogradFilterTransformKernel(CLWinogradFilterTransformKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLWinogradFilterTransformKernel &operator=(CLWinogradFilterTransformKernel &&) = default;
- /** Default destructor */
- ~CLWinogradFilterTransformKernel() = default;
- /** Set the input and output tensor.
- *
- * @note Winograd filter transform supports the following configurations for NCHW data layout
- * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
- * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * @note Winograd filter transform supports the following configurations for NHWC data layout
- * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * Strides: only unit strides
- *
- * @param[in] input Source tensor. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout) or [IFM, kernel_x, kernel_y, OFM] (NHWC data layout). Data types supported: F16/F32.
- * @param[out] output The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. Data types supported: Same as @p input
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
- */
- void configure(const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info);
- /** Set the input and output tensor.
- *
- * @note Winograd filter transform supports the following configurations for NCHW data layout
- * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
- * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * @note Winograd filter transform supports the following configurations for NHWC data layout
- * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * Strides: only unit strides
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout) or [IFM, kernel_x, kernel_y, OFM] (NHWC data layout). Data types supported: F16/F32.
- * @param[out] output The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. Data types supported: Same as @p input
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradFilterTransformKernel
- *
- * @note Winograd filter transform supports the following configurations for NCHW data layout
- * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
- * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * @note Winograd filter transform supports the following configurations for NHWC data layout
- * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * Strides: only unit strides
- *
- * @param[in] input Source tensor. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout) or [IFM, kernel_x, kernel_y, OFM] (NHWC data layout). Data types supported: F16/F32.
- * @param[out] output The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. Data types supported: Same as @p input
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLWINOGRADFILTERTRANSFORMKERNEL_H */
diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
deleted file mode 100644
index 392edda615..0000000000
--- a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLWinogradInputTransformKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-
- const PadStrideInfo conv_info = winograd_info.convolution_info;
- const Size2D output_tile_size = winograd_info.output_tile_size;
- const Size2D kernel_size = winograd_info.kernel_size;
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd input transform not supported");
-
- ARM_COMPUTE_UNUSED(conv_info);
- ARM_COMPUTE_UNUSED(output_tile_size);
- ARM_COMPUTE_UNUSED(kernel_size);
-
- // Validate configured output
- if(output->total_size() != 0)
- {
- const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_UNUSED(output);
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- bool window_changed = false;
- Window win = calculate_max_window(*input, Steps(1, 1));
-
- if(input->data_layout() == DataLayout::NCHW)
- {
- const PadStrideInfo conv_info = winograd_info.convolution_info;
- const Size2D output_tile_size = winograd_info.output_tile_size;
- const Size2D kernel_size = winograd_info.kernel_size;
-
- unsigned int num_elems_read_per_iteration_x = output_tile_size.width + kernel_size.width - 1;
- unsigned int num_elems_read_per_iteration_y = output_tile_size.height + kernel_size.height - 1;
-
- AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), num_elems_read_per_iteration_x, num_elems_read_per_iteration_y);
- window_changed = update_window_and_padding(win, input_access);
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLWinogradInputTransformKernel::CLWinogradInputTransformKernel()
- : _border_size(0), _input(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _num_tiles_x(0), _num_tiles_y(0), _step_z(1)
-{
-}
-
-BorderSize CLWinogradInputTransformKernel::border_size() const
-{
- return _border_size;
-}
-
-void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, winograd_info);
-}
-
-void CLWinogradInputTransformKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), winograd_info));
-
- auto padding_info = get_padding_info({ input, output });
-
- const PadStrideInfo conv_info = winograd_info.convolution_info;
- const Size2D output_tile_size = winograd_info.output_tile_size;
- const Size2D kernel_size = winograd_info.kernel_size;
-
- _data_layout = input->info()->data_layout();
-
- const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-
- // Compute the number of output tiles along the x and y direction of size "output_tile_size"
- const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(input->info()->dimension(idx_w), input->info()->dimension(idx_h)),
- kernel_size,
- output_tile_size,
- conv_info);
-
- _input = input;
- _output = output;
- _num_tiles_x = num_tiles.width;
- _num_tiles_y = num_tiles.height;
-
- const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input->info(), winograd_info);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-
- ARM_COMPUTE_ERROR_ON(_num_tiles_x * _num_tiles_y != static_cast<int>(output->info()->dimension(1)));
- const size_t total_batches = input->info()->tensor_shape().total_size_upper(3);
-
- CLBuildOptions build_opts;
- build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x));
- build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
- build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
- build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
- build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL");
- build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL");
- if(_data_layout == DataLayout::NHWC)
- {
- build_opts.add_option_if(total_batches > 1, "-DNUM_TILES_Y=" + support::cpp11::to_string(_num_tiles_y));
- build_opts.add_option("-DSRC_DIM_1=" + support::cpp11::to_string(_input->info()->dimension(1)));
- build_opts.add_option("-DSRC_DIM_2=" + support::cpp11::to_string(_input->info()->dimension(2)));
- }
- else
- {
- build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(_input->info()->dimension(2)));
- }
-
- // Create kernel
- std::string kernel_name = "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string();
-
- // Get the maximum dimension from the tile size
- const unsigned int tile_max_dim = std::max(output_tile_size.width, output_tile_size.height);
-
- // Check optimized kernel if output_dims == 2x2
- if((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW))
- {
- _step_z = (_input->info()->dimension(2) % 2) != 0 ? 1 : 2;
- }
-
- // Append stepz and data layout
- kernel_name += "_stepz";
- kernel_name += support::cpp11::to_string(_step_z);
- kernel_name += "_" + lower_string(string_from_data_layout(_data_layout));
-
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Create window and update padding
- auto win_config = validate_and_configure_window(input->info(), output->info(), winograd_info);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second, cl::NDRange(1, 1, 8));
-
- _border_size = BorderSize(_input->info()->padding());
-
- ARM_COMPUTE_ERROR_ON((input->info()->data_layout() == DataLayout::NHWC) && has_padding_changed(padding_info));
-
- _config_id = kernel_name;
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(conv_info.pad_left());
- _config_id += "_";
- _config_id += support::cpp11::to_string(conv_info.pad_top());
- _config_id += "_";
- _config_id += lower_string(string_from_data_layout(_data_layout));
-}
-
-Status CLWinogradInputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, winograd_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), winograd_info).first);
-
- return Status{};
-}
-
-void CLWinogradInputTransformKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
- const size_t total_batches = window.shape().total_size_upper(3);
-
- // Collapse window
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-
- Window slice = window_collapsed.first_slice_window_3D();
- slice.set(idx_w, Window::Dimension(0, _num_tiles_x, 1));
- slice.set(idx_h, Window::Dimension(0, _num_tiles_y, 1));
- if(_data_layout == DataLayout::NHWC)
- {
- slice.set(idx_h, Window::Dimension(0, _num_tiles_y * total_batches, 1));
- }
-
- ARM_COMPUTE_ERROR_ON(((slice[idx_c].end() - slice[idx_c].start()) % _step_z) != 0);
- slice.set(idx_c, Window::Dimension(slice[idx_c].start(), slice[idx_c].end(), _step_z));
-
- unsigned int idx = 2 * num_arguments_per_3D_tensor();
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[3]));
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
-
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
-}
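The num_tiles pair computed in the deleted configure() divides the convolution output extent into output-tile-sized pieces. A sketch of that arithmetic under the unit-stride assumption the kernel enforces (not the library's implementation of compute_winograd_convolution_tiles):

    #include <cstddef>

    // Convolution output extent covered by ceil-divided output tiles.
    size_t num_winograd_tiles(size_t in_dim, size_t kernel_dim,
                              size_t pad_before, size_t pad_after, size_t out_tile_dim)
    {
        const size_t conv_out = in_dim + pad_before + pad_after - kernel_dim + 1;
        return (conv_out + out_tile_dim - 1) / out_tile_dim; // ceil division
    }
    // e.g. width 224, 3x3 kernel, pad 1/1, F(4x4, 3x3):
    // conv_out = 224 + 1 + 1 - 3 + 1 = 224 -> (224 + 3) / 4 = 56 tiles along x.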
diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.h b/src/core/CL/kernels/CLWinogradInputTransformKernel.h
deleted file mode 100644
index 25301877e6..0000000000
--- a/src/core/CL/kernels/CLWinogradInputTransformKernel.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWINOGRADINPUTTRANSFORMKERNEL_H
-#define ARM_COMPUTE_CLWINOGRADINPUTTRANSFORMKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform Winograd input transform.*/
-class CLWinogradInputTransformKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLWinogradInputTransformKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWinogradInputTransformKernel(const CLWinogradInputTransformKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWinogradInputTransformKernel &operator=(const CLWinogradInputTransformKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLWinogradInputTransformKernel(CLWinogradInputTransformKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLWinogradInputTransformKernel &operator=(CLWinogradInputTransformKernel &&) = default;
- /** Set the input and output of the kernel.
- *
- * @note Winograd input transform supports the following configurations for NCHW data layout
- * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
- * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * @note Winograd input transform supports the following configurations for NHWC data layout
- * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * Strides: only unit strides
- *
- * @param[in] input The input tensor to transform. Data types supported: F16/F32
- * @param[in] output The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info);
- /** Set the input and output of the kernel.
- *
- * @note Winograd input transform supports the following configurations for NCHW data layout
- * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
- * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * @note Winograd input transform supports the following configurations for NHWC data layout
- * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * Strides: only unit strides
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input The input tensor to transform. Data types supported: F16/F32
- * @param[in] output The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradInputTransformKernel
- *
- * @note Winograd input transform supports the following configurations for NCHW data layout
- * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
- * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * @note Winograd input transform supports the following configurations for NHWC data layout
- * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * Strides: only unit strides
- *
- * @param[in] input The input tensor to transform. Data types supported: F16/F32
- * @param[in] output The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>;
-
- BorderSize _border_size;
- const ICLTensor *_input;
- ICLTensor *_output;
- DataLayout _data_layout;
- int _num_tiles_x;
- int _num_tiles_y;
- unsigned int _step_z;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLWINOGRADINPUTTRANSFORMKERNEL_H */
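A worked check for the channel-count assertion in the output transform removed below: the transform expects (kernel_w + output_tile_w - 1) * (kernel_h + output_tile_h - 1) input channels, one per element of the Winograd tile, so F(4x4, 3x3) requires (3 + 4 - 1) * (3 + 4 - 1) = 36 channels and F(2x2, 3x3) requires (3 + 2 - 1) * (3 + 2 - 1) = 16.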
diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
deleted file mode 100644
index 2018559f60..0000000000
--- a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
+++ /dev/null
@@ -1,265 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLWinogradOutputTransformKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "support/StringSupport.h"
-
-#include <cmath>
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_UNUSED(act_info);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-
- ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != winograd_info.output_data_layout);
-
- const PadStrideInfo conv_info = winograd_info.convolution_info;
- const Size2D output_tile_size = winograd_info.output_tile_size;
- const Size2D kernel_size = winograd_info.kernel_size;
- const Size2D input_dimensions = winograd_info.input_dimensions;
- const unsigned int num_channels = (winograd_info.kernel_size.width + winograd_info.output_tile_size.width - 1) * (winograd_info.kernel_size.height + winograd_info.output_tile_size.height - 1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, winograd_info.output_data_layout), "Winograd output transform not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != num_channels, "Wrong number of channels");
-
- // Compute the number of output tiles along the x and y direction of size "output_tile_size"
- const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions,
- kernel_size,
- output_tile_size,
- conv_info);
-
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast<unsigned int>((num_tiles.area())));
-
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
- }
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info));
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, const Size2D &output_tile_size)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_UNUSED(bias);
-
- constexpr unsigned int num_elems_processed_per_iteration = 1;
-
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- bool window_changed = false;
-
- if(output->data_layout() == DataLayout::NCHW)
- {
- const int output_static_window_end_x = ceil_to_multiple(output->dimension(0), output_tile_size.width);
- const int output_static_window_end_y = ceil_to_multiple(output->dimension(1), output_tile_size.height);
-
- AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
- AccessWindowStatic output_access(output, 0, 0, output_static_window_end_x, output_static_window_end_y);
- window_changed = update_window_and_padding(win, input_access, output_access);
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLWinogradOutputTransformKernel::CLWinogradOutputTransformKernel()
- : _input(nullptr), _bias(nullptr), _output(nullptr), _is_nhwc(false)
-{
-}
-
-void CLWinogradOutputTransformKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, winograd_info, act_info);
-}
-
-void CLWinogradOutputTransformKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const WinogradInfo &winograd_info,
- const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input->info(), winograd_info)));
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), winograd_info, act_info));
-
- auto padding_info = get_padding_info({ input, bias, output });
-
- _input = input;
- _bias = bias;
- _output = output;
- _is_nhwc = winograd_info.output_data_layout == DataLayout::NHWC;
-
- // Compute num_tiles_x
- const Size2D input_dimensions = winograd_info.input_dimensions;
- const Size2D kernel_size = winograd_info.kernel_size;
- const Size2D output_tile_size = winograd_info.output_tile_size;
- const PadStrideInfo conv_info = winograd_info.convolution_info;
- const int idx_width = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::HEIGHT);
-
- // Compute the number of output tiles along the x and y direction of size "output_tile_size"
- const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions,
- kernel_size,
- output_tile_size,
- conv_info);
- const size_t total_batches = output->info()->tensor_shape().total_size_upper(3);
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
- build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
- build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
-
- if((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2))
- {
- build_opts.add_option("-DVEC_SIZE=2");
- }
- else if((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4))
- {
- build_opts.add_option("-DVEC_SIZE=4");
- }
-
- build_opts.add_option_if(_bias != nullptr, std::string("-DHAS_BIAS"));
- build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(num_tiles.width));
- build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
- build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(idx_width)));
- build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(_output->info()->dimension(idx_height)));
- build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(_input->info()->dimension(2)));
- build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL");
- build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL");
-
- // Create kernel
- std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(winograd_info.output_data_layout));
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), winograd_info.output_tile_size);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += lower_string(string_from_data_layout(winograd_info.output_data_layout));
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info) && _is_nhwc);
-}
-
-Status CLWinogradOutputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, (bias != nullptr ? bias->clone().get() : nullptr), output, winograd_info, act_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (bias != nullptr ? bias->clone().get() : nullptr), output->clone().get(), winograd_info.output_tile_size).first);
-
- return Status{};
-}
-
-void CLWinogradOutputTransformKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- // Collapse window
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-
- // Get initial windows
- Window slice = window_collapsed.first_slice_window_4D();
- slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
- // Setup output slice
- Window slice_out(slice);
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- if(_bias != nullptr)
- {
- unsigned int idx1 = 2 * num_arguments_per_4D_tensor();
- Window slice_biases;
- slice_biases.use_tensor_dimensions(_bias->info()->tensor_shape());
- add_1D_tensor_argument(idx1, _bias, slice_biases);
- }
-
- if(_is_nhwc)
- {
- unsigned int idx2 = 2 * num_arguments_per_4D_tensor() + ((_bias != nullptr) ? num_arguments_per_1D_tensor() : 0);
- _kernel.setArg(idx2, static_cast<int>(_output->info()->total_size() - _output->info()->strides_in_bytes().y()));
- }
-
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice);
- add_4D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out));
-}
-} // namespace arm_compute
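The num_channels check in validate_arguments() above encodes the Winograd input-tile geometry: for F(output tile, kernel size), each input tile spans kernel + tile - 1 elements per dimension, and the transformed tensor carries one channel per input-tile element. A standalone worked example of that arithmetic (plain C++, independent of the library):

#include <cassert>

int main()
{
    // F(4x4, 3x3): each input tile is (3 + 4 - 1) x (3 + 4 - 1) = 6 x 6,
    // so the transformed tensor feeding the output transform has 36 channels.
    const unsigned int kernel_w = 3, kernel_h = 3;
    const unsigned int tile_w = 4, tile_h = 4;
    const unsigned int num_channels = (kernel_w + tile_w - 1) * (kernel_h + tile_h - 1);
    assert(num_channels == 36); // matches the input->dimension(2) check above
    return 0;
}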
diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.h b/src/core/CL/kernels/CLWinogradOutputTransformKernel.h
deleted file mode 100644
index 632a5629d9..0000000000
--- a/src/core/CL/kernels/CLWinogradOutputTransformKernel.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H
-#define ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the Winograd output transform kernel. */
-class CLWinogradOutputTransformKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLWinogradOutputTransformKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWinogradOutputTransformKernel(const CLWinogradOutputTransformKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWinogradOutputTransformKernel &operator=(const CLWinogradOutputTransformKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLWinogradOutputTransformKernel(CLWinogradOutputTransformKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLWinogradOutputTransformKernel &operator=(CLWinogradOutputTransformKernel &&) = default;
- /** Default destructor */
- ~CLWinogradOutputTransformKernel() = default;
- /** Set the input, bias and output tensors.
- *
- * @note Winograd output transform supports the following configurations for NCHW data layout
- * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
- * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * @note Winograd output transform supports the following configurations for NHWC data layout
- * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * Strides: only unit strides
- *
- * @param[in] input Source tensor with shape [C, N, K, batches]. Data types supported: F16/F32.
- * @param[in] bias Biases tensor. Shared biases supported. Biases are a 1D tensor with dimensions [OFM]. It can be nullptr. Data type supported: as @p input
- * @param[out] output The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_output_transform_shape. Data types supported: Same as @p input
- * @param[in] winograd_info Contains the Winograd parameters described in @ref WinogradInfo
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Set the input, bias and output tensors.
- *
- * @note Winograd output transform supports the following configurations for NCHW data layout
- * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
- * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * @note Winograd output transform supports the following configurations for NHWC data layout
- * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * Strides: only unit strides
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor with shape [C, N, K, batches]. Data types supported: F16/F32.
- * @param[in] bias Biases tensor. Shared biases supported. Biases are a 1D tensor with dimensions [OFM]. It can be nullptr. Data type supported: as @p input
- * @param[out] output The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_output_transform_shape. Data types supported: Same as @p input
- * @param[in] winograd_info Contains the Winograd parameters described in @ref WinogradInfo
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const WinogradInfo &winograd_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradOutputTransformKernel
- *
- * @note Winograd output transform supports the following configurations for NCHW data layout
- * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
- * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * @note Winograd output transform supports the following configurations for NHWC data layout
- * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * Strides: only unit strides
- *
- * @param[in] input Source tensor with shape [C, N, K, batches]. Data types supported: F16/F32.
- * @param[in] bias Biases tensor. Shared biases supported. Biases are a 1D tensor with dimensions [OFM]. It can be nullptr. Data type supported: as @p input
- * @param[out] output The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_output_transform_shape. Data types supported: Same as @p input
- * @param[in] winograd_info Contains the Winograd parameters described in @ref WinogradInfo
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation @ref ActivationLayerInfo. Only RELU, BOUNDED_RELU, LU_BOUNDED_RELU, LEAKY_RELU and SOFT_RELU are supported.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>;
-
- const ICLTensor *_input;
- const ICLTensor *_bias;
- ICLTensor *_output;
- bool _is_nhwc;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H */
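A matching call-site sketch for the output transform removed above, this time fusing a bounded ReLU; the function name and the choice of activation bound are illustrative assumptions:

using namespace arm_compute;

void setup_winograd_output_transform(const ICLTensor *src, const ICLTensor *bias, ICLTensor *dst,
                                     const WinogradInfo &winograd_info)
{
    // Fuse a bounded ReLU (clamp to [0, 6]) into the output transform.
    const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f);

    ARM_COMPUTE_ERROR_THROW_ON(CLWinogradOutputTransformKernel::validate(
        src->info(), bias != nullptr ? bias->info() : nullptr, dst->info(), winograd_info, act));

    CLWinogradOutputTransformKernel kernel;
    kernel.configure(src, bias, dst, winograd_info, act); // bias may be nullptr
}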
diff --git a/src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h b/src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h
deleted file mode 100644
index 4c92ae417f..0000000000
--- a/src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICLDEPTHWISECONVOLUTIONKERNEL3x3_H
-#define ARM_COMPUTE_ICLDEPTHWISECONVOLUTIONKERNEL3x3_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor.
- */
-class ICLDepthwiseConvolutionLayer3x3Kernel : public ICLKernel
-{
-public:
- /** Default constructor */
- ICLDepthwiseConvolutionLayer3x3Kernel()
- : _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_y(1), _output_multipliers(), _output_shifts(), _is_quantized(false)
- {
- }
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- ICLDepthwiseConvolutionLayer3x3Kernel(const ICLDepthwiseConvolutionLayer3x3Kernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- ICLDepthwiseConvolutionLayer3x3Kernel &operator=(const ICLDepthwiseConvolutionLayer3x3Kernel &) = delete;
- /** Default move constructor */
- ICLDepthwiseConvolutionLayer3x3Kernel(ICLDepthwiseConvolutionLayer3x3Kernel &&) = default;
- /** Default move assignment operator */
- ICLDepthwiseConvolutionLayer3x3Kernel &operator=(ICLDepthwiseConvolutionLayer3x3Kernel &&) = default;
- /** Initialize the function's source, destination, convolution information and border size.
- *
- * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32.
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [3, 3, IFM].
- * Data type supported: Same as @p input, QASYMM8/QSYMM8_PER_CHANNEL when input is QASYMM8.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported for QASYMM8.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- * the number of shifts must be equal to the number of filters (IFM). Supported data types: S32
- */
- virtual void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
- const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) = 0;
- /** Initialize the function's source, destination, convolution information and border size.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32.
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [3, 3, IFM].
- * Data type supported: Same as @p input, QASYMM8/QSYMM8_PER_CHANNEL when input is QASYMM8.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported for QASYMM8.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- * the number of shifts must be equal to the number of filters (IFM). Supported data types: S32
- */
- virtual void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
- const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) = 0;
-
-protected:
- BorderSize _border_size;
- const ICLTensor *_input;
- ICLTensor *_output;
- const ICLTensor *_weights;
- const ICLTensor *_biases;
- unsigned int _conv_stride_y;
- const ICLTensor *_output_multipliers;
- const ICLTensor *_output_shifts;
- bool _is_quantized;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_ICLDEPTHWISECONVOLUTIONKERNEL3x3_H */
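For completeness, the smallest shape a concrete kernel takes when deriving from the interface removed above; the class name is hypothetical and the bodies are stubs, not an implementation from the library:

namespace arm_compute
{
class CLExampleDepthwiseConvolutionLayer3x3Kernel final : public ICLDepthwiseConvolutionLayer3x3Kernel
{
public:
    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases,
                   ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
                   ActivationLayerInfo act_info, const Size2D &dilation,
                   const ICLTensor *output_multipliers, const ICLTensor *output_shifts) override
    {
        // Forward to the variant taking an explicit compile context.
        configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output,
                  conv_info, depth_multiplier, act_info, dilation, output_multipliers, output_shifts);
    }

    void configure(const CLCompileContext &compile_context, const ICLTensor *input,
                   const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
                   const PadStrideInfo &conv_info, unsigned int depth_multiplier,
                   ActivationLayerInfo act_info, const Size2D &dilation,
                   const ICLTensor *output_multipliers, const ICLTensor *output_shifts) override
    {
        ARM_COMPUTE_UNUSED(compile_context, conv_info, depth_multiplier, act_info, dilation);
        // Cache the tensors in the protected base members that run() reads back.
        _input              = input;
        _weights            = weights;
        _biases             = biases;
        _output             = output;
        _output_multipliers = output_multipliers;
        _output_shifts      = output_shifts;
        _is_quantized       = is_data_type_quantized_asymmetric(input->info()->data_type());
        // Real kernels build OpenCL options, create the kernel and configure the window here.
    }

    void run(const Window &window, cl::CommandQueue &queue) override
    {
        ARM_COMPUTE_UNUSED(window, queue); // stub: real kernels enqueue work here
    }
};
} // namespace arm_compute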