aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/kernels
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/CL/kernels')
-rw-r--r--src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp107
-rw-r--r--src/core/CL/kernels/CLAccumulateKernel.cpp101
-rw-r--r--src/core/CL/kernels/CLActivationLayerKernel.cpp283
-rw-r--r--src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp194
-rw-r--r--src/core/CL/kernels/CLArgMinMaxLayerKernel.h105
-rw-r--r--src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp174
-rw-r--r--src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp200
-rw-r--r--src/core/CL/kernels/CLBatchNormalizationLayerKernel.h139
-rw-r--r--src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp131
-rw-r--r--src/core/CL/kernels/CLBatchToSpaceLayerKernel.h137
-rw-r--r--src/core/CL/kernels/CLBitwiseAndKernel.cpp93
-rw-r--r--src/core/CL/kernels/CLBitwiseKernel.cpp122
-rw-r--r--src/core/CL/kernels/CLBitwiseKernel.h77
-rw-r--r--src/core/CL/kernels/CLBitwiseNotKernel.cpp53
-rw-r--r--src/core/CL/kernels/CLBitwiseOrKernel.cpp94
-rw-r--r--src/core/CL/kernels/CLBitwiseXorKernel.cpp94
-rw-r--r--src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp55
-rw-r--r--src/core/CL/kernels/CLBoundingBoxTransformKernel.h109
-rw-r--r--src/core/CL/kernels/CLBox3x3Kernel.cpp82
-rw-r--r--src/core/CL/kernels/CLCannyEdgeKernel.cpp309
-rw-r--r--src/core/CL/kernels/CLChannelCombineKernel.cpp298
-rw-r--r--src/core/CL/kernels/CLChannelExtractKernel.cpp197
-rw-r--r--src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp104
-rw-r--r--src/core/CL/kernels/CLChannelShuffleLayerKernel.h85
-rw-r--r--src/core/CL/kernels/CLCol2ImKernel.cpp176
-rw-r--r--src/core/CL/kernels/CLColorConvertKernel.cpp560
-rw-r--r--src/core/CL/kernels/CLComparisonKernel.cpp117
-rw-r--r--src/core/CL/kernels/CLComparisonKernel.h96
-rw-r--r--src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp116
-rw-r--r--src/core/CL/kernels/CLConvolutionKernel.cpp392
-rw-r--r--src/core/CL/kernels/CLCopyKernel.cpp286
-rw-r--r--src/core/CL/kernels/CLCropKernel.cpp139
-rw-r--r--src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp37
-rw-r--r--src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h89
-rw-r--r--src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp93
-rw-r--r--src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h121
-rw-r--r--src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp158
-rw-r--r--src/core/CL/kernels/CLDepthConvertLayerKernel.cpp142
-rw-r--r--src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp41
-rw-r--r--src/core/CL/kernels/CLDepthToSpaceLayerKernel.h86
-rw-r--r--src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp428
-rw-r--r--src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp468
-rw-r--r--src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp498
-rw-r--r--src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h131
-rw-r--r--src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp133
-rw-r--r--src/core/CL/kernels/CLDequantizationLayerKernel.cpp164
-rw-r--r--src/core/CL/kernels/CLDerivativeKernel.cpp157
-rw-r--r--src/core/CL/kernels/CLDilateKernel.cpp70
-rw-r--r--src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp638
-rw-r--r--src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp141
-rw-r--r--src/core/CL/kernels/CLElementwiseOperationKernel.cpp463
-rw-r--r--src/core/CL/kernels/CLErodeKernel.cpp70
-rw-r--r--src/core/CL/kernels/CLFFTDigitReverseKernel.cpp59
-rw-r--r--src/core/CL/kernels/CLFFTDigitReverseKernel.h97
-rw-r--r--src/core/CL/kernels/CLFFTRadixStageKernel.cpp66
-rw-r--r--src/core/CL/kernels/CLFFTRadixStageKernel.h100
-rw-r--r--src/core/CL/kernels/CLFFTScaleKernel.cpp72
-rw-r--r--src/core/CL/kernels/CLFFTScaleKernel.h89
-rw-r--r--src/core/CL/kernels/CLFastCornersKernel.cpp211
-rw-r--r--src/core/CL/kernels/CLFillBorderKernel.cpp116
-rw-r--r--src/core/CL/kernels/CLFillBorderKernel.h108
-rw-r--r--src/core/CL/kernels/CLFlattenLayerKernel.cpp140
-rw-r--r--src/core/CL/kernels/CLFloorKernel.cpp137
-rw-r--r--src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp140
-rw-r--r--src/core/CL/kernels/CLFuseBatchNormalizationKernel.h147
-rw-r--r--src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp336
-rw-r--r--src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp320
-rw-r--r--src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp577
-rw-r--r--src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp260
-rw-r--r--src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp329
-rw-r--r--src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp183
-rw-r--r--src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp177
-rw-r--r--src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp182
-rw-r--r--src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp183
-rw-r--r--src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp183
-rw-r--r--src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp227
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp142
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp548
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp417
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp401
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp419
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp179
-rw-r--r--src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp215
-rw-r--r--src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp194
-rw-r--r--src/core/CL/kernels/CLGatherKernel.cpp54
-rw-r--r--src/core/CL/kernels/CLGatherKernel.h95
-rw-r--r--src/core/CL/kernels/CLGaussian3x3Kernel.cpp81
-rw-r--r--src/core/CL/kernels/CLGaussian5x5Kernel.cpp55
-rw-r--r--src/core/CL/kernels/CLGaussianPyramidKernel.cpp246
-rw-r--r--src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp46
-rw-r--r--src/core/CL/kernels/CLGenerateProposalsLayerKernel.h88
-rw-r--r--src/core/CL/kernels/CLHOGDescriptorKernel.cpp239
-rw-r--r--src/core/CL/kernels/CLHOGDetectorKernel.cpp148
-rw-r--r--src/core/CL/kernels/CLHarrisCornersKernel.cpp150
-rw-r--r--src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp148
-rw-r--r--src/core/CL/kernels/CLHistogramKernel.cpp255
-rw-r--r--src/core/CL/kernels/CLIm2ColKernel.cpp426
-rw-r--r--src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp159
-rw-r--r--src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h134
-rw-r--r--src/core/CL/kernels/CLIntegralImageKernel.cpp147
-rw-r--r--src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp98
-rw-r--r--src/core/CL/kernels/CLL2NormalizeLayerKernel.h107
-rw-r--r--src/core/CL/kernels/CLLKTrackerKernel.cpp315
-rw-r--r--src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp154
-rw-r--r--src/core/CL/kernels/CLMagnitudePhaseKernel.cpp177
-rw-r--r--src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp157
-rw-r--r--src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h93
-rw-r--r--src/core/CL/kernels/CLMeanStdDevKernel.cpp158
-rw-r--r--src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp64
-rw-r--r--src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h93
-rw-r--r--src/core/CL/kernels/CLMedian3x3Kernel.cpp88
-rw-r--r--src/core/CL/kernels/CLMemsetKernel.cpp112
-rw-r--r--src/core/CL/kernels/CLMinMaxLayerKernel.cpp169
-rw-r--r--src/core/CL/kernels/CLMinMaxLocationKernel.cpp245
-rw-r--r--src/core/CL/kernels/CLNonLinearFilterKernel.cpp105
-rw-r--r--src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp77
-rw-r--r--src/core/CL/kernels/CLNormalizationLayerKernel.cpp174
-rw-r--r--src/core/CL/kernels/CLNormalizationLayerKernel.h93
-rw-r--r--src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp105
-rw-r--r--src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h99
-rw-r--r--src/core/CL/kernels/CLPadLayerKernel.cpp181
-rw-r--r--src/core/CL/kernels/CLPadLayerKernel.h106
-rw-r--r--src/core/CL/kernels/CLPermuteKernel.cpp141
-rw-r--r--src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp467
-rw-r--r--src/core/CL/kernels/CLPoolingLayerKernel.cpp410
-rw-r--r--src/core/CL/kernels/CLPriorBoxLayerKernel.cpp91
-rw-r--r--src/core/CL/kernels/CLPriorBoxLayerKernel.h114
-rw-r--r--src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp56
-rw-r--r--src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h93
-rw-r--r--src/core/CL/kernels/CLQuantizationLayerKernel.cpp190
-rw-r--r--src/core/CL/kernels/CLROIAlignLayerKernel.cpp98
-rw-r--r--src/core/CL/kernels/CLROIAlignLayerKernel.h116
-rw-r--r--src/core/CL/kernels/CLROIPoolingLayerKernel.cpp156
-rw-r--r--src/core/CL/kernels/CLROIPoolingLayerKernel.h112
-rw-r--r--src/core/CL/kernels/CLRangeKernel.cpp101
-rw-r--r--src/core/CL/kernels/CLRangeKernel.h93
-rw-r--r--src/core/CL/kernels/CLReductionOperationKernel.cpp342
-rw-r--r--src/core/CL/kernels/CLReductionOperationKernel.h100
-rw-r--r--src/core/CL/kernels/CLRemapKernel.cpp117
-rw-r--r--src/core/CL/kernels/CLReorgLayerKernel.cpp52
-rw-r--r--src/core/CL/kernels/CLReorgLayerKernel.h91
-rw-r--r--src/core/CL/kernels/CLReshapeLayerKernel.cpp130
-rw-r--r--src/core/CL/kernels/CLReverseKernel.cpp56
-rw-r--r--src/core/CL/kernels/CLReverseKernel.h98
-rw-r--r--src/core/CL/kernels/CLScaleKernel.cpp307
-rw-r--r--src/core/CL/kernels/CLScharr3x3Kernel.cpp129
-rw-r--r--src/core/CL/kernels/CLSelectKernel.cpp101
-rw-r--r--src/core/CL/kernels/CLSelectKernel.h91
-rw-r--r--src/core/CL/kernels/CLSobel3x3Kernel.cpp143
-rw-r--r--src/core/CL/kernels/CLSobel5x5Kernel.cpp253
-rw-r--r--src/core/CL/kernels/CLSobel7x7Kernel.cpp257
-rw-r--r--src/core/CL/kernels/CLSoftmaxLayerKernel.cpp426
-rw-r--r--src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp129
-rw-r--r--src/core/CL/kernels/CLSpaceToBatchLayerKernel.h145
-rw-r--r--src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp35
-rw-r--r--src/core/CL/kernels/CLSpaceToDepthLayerKernel.h86
-rw-r--r--src/core/CL/kernels/CLStackLayerKernel.cpp51
-rw-r--r--src/core/CL/kernels/CLStackLayerKernel.h112
-rw-r--r--src/core/CL/kernels/CLStridedSliceKernel.cpp167
-rw-r--r--src/core/CL/kernels/CLStridedSliceKernel.h94
-rw-r--r--src/core/CL/kernels/CLTableLookupKernel.cpp68
-rw-r--r--src/core/CL/kernels/CLThresholdKernel.cpp82
-rw-r--r--src/core/CL/kernels/CLTileKernel.cpp52
-rw-r--r--src/core/CL/kernels/CLTileKernel.h91
-rw-r--r--src/core/CL/kernels/CLTransposeKernel.cpp138
-rw-r--r--src/core/CL/kernels/CLUpsampleLayerKernel.cpp171
-rw-r--r--src/core/CL/kernels/CLWarpAffineKernel.cpp132
-rw-r--r--src/core/CL/kernels/CLWarpPerspectiveKernel.cpp104
-rw-r--r--src/core/CL/kernels/CLWeightsReshapeKernel.cpp162
-rw-r--r--src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp185
-rw-r--r--src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp237
-rw-r--r--src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp147
-rw-r--r--src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp159
-rw-r--r--src/core/CL/kernels/CLWinogradInputTransformKernel.cpp270
-rw-r--r--src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp268
-rw-r--r--src/core/CL/kernels/CLYOLOLayerKernel.cpp186
176 files changed, 6839 insertions, 23111 deletions
diff --git a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp
deleted file mode 100644
index 5b03fb56e7..0000000000
--- a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLAbsoluteDifferenceKernel::CLAbsoluteDifferenceKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLAbsoluteDifferenceKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
-}
-
-void CLAbsoluteDifferenceKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
- "The output image can only be U8 if both input images are U8");
-
- _input1 = input1;
- _input2 = input2;
- _output = output;
-
- // Set kernel build options
- std::set<std::string> build_opts;
- build_opts.insert("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
- build_opts.insert("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
- build_opts.insert("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-
- // Create kernel
- _kernel = create_kernel(compile_context, "absdiff", build_opts);
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 16;
-
- Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, input1_access, input2_access, output_access);
-
- ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
- input2->info()->valid_region());
-
- output_access.set_valid_region(win, valid_region);
-
- ICLKernel::configure_internal(win);
-}
-
-void CLAbsoluteDifferenceKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input1, slice);
- add_2D_tensor_argument(idx, _input2, slice);
- add_2D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLAccumulateKernel.cpp b/src/core/CL/kernels/CLAccumulateKernel.cpp
deleted file mode 100644
index a7dfcdca9b..0000000000
--- a/src/core/CL/kernels/CLAccumulateKernel.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-
-namespace arm_compute
-{
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-} // namespace
-
-void CLAccumulateKernel::configure(const ICLTensor *input, ICLTensor *accum)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, accum);
-}
-
-void CLAccumulateKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *accum)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
-
- // Create kernel
- _kernel = create_kernel(compile_context, "accumulate");
-
- // Make sure _kernel is initialized before calling the parent's configure
- ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration);
-}
-
-void CLAccumulateWeightedKernel::configure(const ICLTensor *input, float alpha, ICLTensor *accum)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, alpha, accum);
-}
-
-void CLAccumulateWeightedKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, float alpha, ICLTensor *accum)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON(alpha < 0.0 || alpha > 1.0);
-
- // Create kernel
- _kernel = create_kernel(compile_context, "accumulate_weighted");
-
- // Set static kernel arguments
- unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
- _kernel.setArg(idx++, alpha);
-
- // Configure kernel window
- ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration);
-}
-
-void CLAccumulateSquaredKernel::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, shift, accum);
-}
-
-void CLAccumulateSquaredKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, uint32_t shift, ICLTensor *accum)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
- ARM_COMPUTE_ERROR_ON(shift > 15);
-
- // Create kernel
- _kernel = create_kernel(compile_context, "accumulate_squared");
-
- // Set static kernel arguments
- unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
- _kernel.setArg(idx++, shift);
-
- // Configure kernel window
- ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration);
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
deleted file mode 100644
index d40e9a15be..0000000000
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
-
-#include "arm_compute/core/CL/CLCoreRuntimeContext.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/helpers/float_ops.h"
-#include "support/StringSupport.h"
-
-#include <cmath>
-#include <set>
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::F16, DataType::F32);
-
- static std::set<ActivationLayerInfo::ActivationFunction> quantized_supported_activations =
- {
- ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LOGISTIC,
- ActivationLayerInfo::ActivationFunction::TANH,
- ActivationLayerInfo::ActivationFunction::HARD_SWISH
- };
- const DataType data_type = input->data_type();
- const QuantizationInfo &oq_info = (output != nullptr) ? output->quantization_info() : input->quantization_info();
- const ActivationLayerInfo::ActivationFunction f_act = act_info.activation();
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(data_type) && (quantized_supported_activations.count(f_act) == 0),
- "For Quantized data type only tanh, logistic, relu and lower/upper bounded relu are supported");
-
- ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 128)));
- ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, 0)));
-
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
-
- ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0)));
- ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128)));
-
- // Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- if(output != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, *input);
- }
-
- const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
-
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- bool window_changed = false;
-
- if(output != nullptr)
- {
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, input->valid_region());
- }
- else
- {
- window_changed = update_window_and_padding(win,
- AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration));
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLActivationLayerKernel::CLActivationLayerKernel()
- : _input(nullptr), _output(nullptr), _run_in_place(false)
-{
-}
-
-void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, act_info);
-}
-
-void CLActivationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input);
-
- _run_in_place = (output == nullptr) || (output == input);
-
- if(output != nullptr)
- {
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(),
- *input->info()->clone());
- }
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, act_info));
-
- const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
- const DataType dt = input->info()->data_type();
- float a_const = act_info.a();
- float b_const = act_info.b();
-
- const ActivationLayerInfo::ActivationFunction f_act = act_info.activation();
- const bool is_quantized = is_data_type_quantized(dt);
- const bool perform_activation_in_float =
- (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) || (f_act == ActivationLayerInfo::ActivationFunction::TANH) || (f_act == ActivationLayerInfo::ActivationFunction::HARD_SWISH);
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option_if(perform_activation_in_float, "-DFLOAT_DOMAIN");
- build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
- build_opts.add_option(("-DACT=" + lower_string(string_from_activation_func(f_act))));
- build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
- build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
- std::string kernel_name = std::string("activation_layer");
-
- // Set quantization info build options
- if(is_quantized)
- {
- const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
-
- if(!perform_activation_in_float)
- {
- int a_const_int = 0;
- int b_const_int = 0;
-
- // Create quantized version of constants a, b if needed
- switch(dt)
- {
- case DataType::QASYMM8:
- {
- a_const_int = quantize_qasymm8(a_const, iq_info);
- b_const_int = quantize_qasymm8(b_const, iq_info);
- }
- break;
- case DataType::QASYMM8_SIGNED:
- {
- a_const_int = quantize_qasymm8_signed(a_const, iq_info);
- b_const_int = quantize_qasymm8_signed(b_const, iq_info);
- }
- break;
- case DataType::QSYMM16:
- {
- a_const_int = quantize_qsymm16(a_const, iq_info);
- b_const_int = quantize_qsymm16(b_const, iq_info);
- }
- break;
- default:
- break;
- }
- build_opts.add_option(("-DA_VAL=" + support::cpp11::to_string(a_const_int)));
- build_opts.add_option(("-DB_VAL=" + support::cpp11::to_string(b_const_int)));
- }
- else
- {
- build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(a_const)));
- build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(b_const)));
- }
-
- // Quantized value of 0 corresponds to the offset o1
- build_opts.add_option(("-DCONST_0=" + (is_data_type_quantized_asymmetric(dt) ? support::cpp11::to_string(iq_info.offset) : "0")));
- build_opts.add_option(("-DS1_VAL=" + float_to_string_with_full_precision(iq_info.scale)));
- build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DO1_VAL=" + support::cpp11::to_string(iq_info.offset));
-
- // Set correct kernel name
- kernel_name += perform_activation_in_float ? std::string("_quant_f32") : std::string("_quant");
-
- // Set scale and offset of the input and output if they have different quantization info
- if(output != nullptr)
- {
- const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
-
- if(iq_info != oq_info)
- {
- build_opts.add_option(("-DS2_VAL=" + float_to_string_with_full_precision(oq_info.scale)));
- build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DO2_VAL=" + support::cpp11::to_string(oq_info.offset));
- }
- }
- }
- else
- {
- // Set A, B constants in build options for float types
- build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(a_const)));
- build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(b_const)));
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Make sure _kernel is initialized before calling the parent's configure
- _input = input;
- _output = output;
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Set config_id for enabling LWS tuning
- _config_id = "activation_layer_";
- _config_id += lower_string(string_from_data_type(dt));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-Status CLActivationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
- const bool run_in_place = (output == nullptr) || (output == input);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get()).first);
-
- return Status{};
-}
-
-void CLActivationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- if(!_run_in_place)
- {
- add_3D_tensor_argument(idx, _output, slice);
- }
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
index b86e43e6fb..5b72354abe 100644
--- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,141 +21,115 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h"
+#include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h"
-#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-constexpr unsigned int vector_size = 16;
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *prev_output, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Only ARG_IDX_MAX and ARG_IDX_MIN are supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::S64);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN,
+ "Only ARG_IDX_MAX and ARG_IDX_MIN are supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32);
- }
- if(prev_output != nullptr && prev_output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(prev_output, 1, DataType::U32, DataType::S32);
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(prev_output, output);
- }
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32, DataType::S64,
+ DataType::U64);
}
return Status{};
}
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *prev_output, ITensorInfo *output, unsigned int axis, ReductionOperation op)
-{
- ARM_COMPUTE_UNUSED(op);
- // Output tensor auto initialization if not yet initialized
- TensorShape output_shape{ input->tensor_shape() };
- output_shape.set(axis, 1);
- DataType output_data_type = DataType::S32;
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
-
- Window win = calculate_max_window((prev_output != nullptr) ? (*prev_output) : (*input), Steps(vector_size));
- bool window_changed = false;
-
- switch(axis)
- {
- case 0:
- {
- ITensorInfo *input_tensor_access = prev_output != nullptr ? prev_output : input;
- AccessWindowStatic input_access(input_tensor_access, 0, 0, static_cast<int>(input_tensor_access->dimension(0)), 1);
- AccessWindowHorizontal output_access(output, 0, 1);
- window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- }
- break;
- case 1:
- case 2:
- case 3:
- {
- AccessWindowHorizontal input_access(input, 0, vector_size);
- AccessWindowHorizontal output_access(output, 0, vector_size);
- window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_tuple(err, win);
-}
} // namespace
CLArgMinMaxLayerKernel::CLArgMinMaxLayerKernel()
- : _input(nullptr), _prev_output(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::ARG_IDX_MAX)
+ : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::ARG_IDX_MAX)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLArgMinMaxLayerKernel::configure(const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLArgMinMaxLayerKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, prev_output, output, axis, op);
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op);
}
-void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, output->info(), axis, op));
- auto win_config = validate_and_configure_window(input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, output->info(), axis, op);
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+ TensorShape output_shape{input->info()->tensor_shape()};
+ output_shape.set(axis, 1);
+ auto_init_if_empty(*output->info(), input->info()
+ ->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_type(DataType::S32)
+ .reset_padding()
+ .set_is_resizable(true));
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
+
+ auto padding_info = get_padding_info({input, output});
_input = input;
- _prev_output = prev_output;
_output = output;
_reduction_axis = axis;
_op = op;
// Set build options
- CLBuildOptions build_opts;
+ const auto adjusted_vector_size = adjust_vec_size(16U, input->info()->dimension(0));
+ const auto vector_size = (adjusted_vector_size == 3U && axis == 0U)
+ ? 2U
+ : adjusted_vector_size; // the opencl kernel only supports sizes 2, 4, 8 and 16.
- build_opts.add_option_if(_prev_output != nullptr, "-DPREV_OUTPUT");
+ CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(input->info()->dimension(0) % vector_size));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vector_size));
build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE");
build_opts.add_option_if_else(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX", "-DARG_MIN");
build_opts.add_option("-DDATA_TYPE_OUTPUT=" + get_cl_type_from_data_type(output->info()->data_type()));
- build_opts.add_option("-DDATA_TYPE_SELECT=" + get_cl_signed_type_from_element_size(input->info()->element_size()));
+ build_opts.add_option("-DCOND_DATA_TYPE=" + get_cl_select_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DUNROLL_WITH_PRAGMA=1");
// Create kernel
- cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange();
std::string kernel_axis_name;
- switch(axis)
+ switch (axis)
{
case 0:
- {
- const ICLTensor *input_for_width = prev_output != nullptr ? _prev_output : _input;
- build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input_for_width->info()->dimension(0)));
-
+ build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
kernel_axis_name = "x";
- lws_hint = create_lws_hint_parallel_implementations(input_for_width->info()->dimension(0), vector_size);
- }
- break;
+ break;
case 1:
build_opts.add_option("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
kernel_axis_name = "y";
@@ -175,13 +149,18 @@ void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context,
_kernel = create_kernel(compile_context, "arg_min_max_" + kernel_axis_name, build_opts.options());
// Configure kernel window
- ICLKernel::configure_internal(std::get<1>(win_config), lws_hint);
+ Window win = calculate_max_window(*input->info(), Steps(vector_size));
+ ICLKernel::configure_internal(win);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLArgMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *prev_output, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status CLArgMinMaxLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ unsigned int axis,
+ ReductionOperation op)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, prev_output, output, axis, op));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), (prev_output != nullptr) ? prev_output->clone().get() : nullptr, output->clone().get(), axis, op)));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
return Status{};
}
@@ -190,43 +169,36 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- switch(_reduction_axis)
+ switch (_reduction_axis)
{
case 0:
{
// Set out window
Window out_window(window);
+ Window in_window(window);
out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+ in_window.set(Window::DimX,
+ Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+ in_window.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), 1u));
// Get first input and output slices
- Window in_slice = window.first_slice_window_2D();
+ Window in_slice = in_window.first_slice_window_2D();
Window out_slice = out_window.first_slice_window_2D();
-
- // Reshape window
- const unsigned int num_tensors = _prev_output != nullptr ? 3 : 2;
-
- // Set local sums buffer
- unsigned int local_res_size = lws_hint()[0] * _output->info()->element_size();
- _kernel.setArg(num_arguments_per_2D_tensor() * num_tensors, local_res_size, nullptr);
do
{
unsigned int idx = 0;
add_2D_tensor_argument(idx, _input, in_slice);
- if(_prev_output != nullptr)
- {
- add_2D_tensor_argument(idx, _prev_output, in_slice);
- }
add_2D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+ } while (in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
}
break;
case 1:
{
// Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
+ Window window_in{window};
+ window_in.set(Window::DimY,
+ Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
Window in_slice = window_in.first_slice_window_2D();
Window out_slice = window.first_slice_window_2D();
@@ -236,15 +208,15 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
add_2D_tensor_argument(idx, _input, in_slice);
add_2D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+ } while (window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
}
break;
case 2:
{
// Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
+ Window window_in{window};
+ window_in.set(Window::DimZ,
+ Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
Window in_slice = window_in.first_slice_window_3D();
Window out_slice = window.first_slice_window_3D();
@@ -254,14 +226,13 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
add_3D_tensor_argument(idx, _input, in_slice);
add_3D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
+ } while (window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
}
break;
case 3:
{
// Get first input and output slices
- Window window_in{ window };
+ Window window_in{window};
window_in.set(3, Window::Dimension(0, 1, 1));
Window in_slice = window_in.first_slice_window_4D();
Window out_slice = window.first_slice_window_4D();
@@ -272,8 +243,7 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
add_4D_tensor_argument(idx, _input, in_slice);
add_4D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
+ } while (window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
}
break;
default:
diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.h b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h
new file mode 100644
index 0000000000..fb3b41b0de
--- /dev/null
+++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2019-2020, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H
+#define ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the reduction operation kernel
+ *
+ * @note The default data type for an uninitialized output tensor is
+ * signed 32-bit integer (S32). It is the user's responsibility to check
+ * that the results do not overflow because the indices are computed
+ * in unsigned 32-bit (U32).
+ */
+class CLArgMinMaxLayerKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLArgMinMaxLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLArgMinMaxLayerKernel(const CLArgMinMaxLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLArgMinMaxLayerKernel &operator=(const CLArgMinMaxLayerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLArgMinMaxLayerKernel(CLArgMinMaxLayerKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLArgMinMaxLayerKernel &operator=(CLArgMinMaxLayerKernel &&) = default;
+ /** Default destructor */
+ ~CLArgMinMaxLayerKernel() = default;
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
+ * @param[out] output Destination tensor. Data types supported: U32/S32
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
+ * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op);
+ /** Set the input and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
+ * @param[out] output Destination tensor. Data types supported: U32/S32
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
+ * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayerKernel.
+ *
+ * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
+ * @param[in] output Destination tensor info. Data types supported: U32/S32
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
+ * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported.
+ *
+ * @return a status
+ */
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ unsigned int _reduction_axis;
+ ReductionOperation _op;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp
deleted file mode 100644
index 2182019a40..0000000000
--- a/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-
-#include "support/StringSupport.h"
-
-#include <map>
-
-using namespace arm_compute;
-
-namespace
-{
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int batch_offset, ITensorInfo *output)
-{
- ARM_COMPUTE_UNUSED(batch_offset);
-
- const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
-
- // The window needs to be based on output, except for the batch size
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- // The total batch size is the concatenation of the batch size of the inputs
- win.set(3, Window::Dimension(0, input->tensor_shape()[3], 1));
-
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- bool window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-Status validate_arguments(const ITensorInfo *input, unsigned int batch_offset, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX));
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) != output->dimension(Window::DimY));
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimZ) != output->dimension(Window::DimZ));
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(3) + batch_offset > output->dimension(3));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(4, input, output);
-
- return Status{};
-}
-} // namespace
-
-CLBatchConcatenateLayerKernel::CLBatchConcatenateLayerKernel()
- : _input(nullptr), _output(nullptr), _batch_offset(0)
-{
-}
-
-void CLBatchConcatenateLayerKernel::configure(const ICLTensor *input, unsigned int batch_offset, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, batch_offset, output);
-}
-
-void CLBatchConcatenateLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int batch_offset, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), batch_offset, output->info()));
-
- _input = input;
- _output = output;
- _batch_offset = batch_offset;
-
- const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
-
- // Add build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info())
- {
- const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
-
- build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
- build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
- build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale));
- build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, "concatenate", build_opts.options());
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), batch_offset, output->info());
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
- ICLKernel::configure_internal(std::get<1>(win_config));
-
- // Set output valid region
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- // Set config_id for enabling LWS tuning
- _config_id = "concatenate_";
- _config_id += support::cpp11::to_string(3);
- _config_id += "_";
- _config_id += support::cpp11::to_string(batch_offset);
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(3));
-}
-
-Status CLBatchConcatenateLayerKernel::validate(const arm_compute::ITensorInfo *input,
- unsigned int batch_offset,
- const arm_compute::ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, batch_offset, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), batch_offset, output->clone().get()).first);
- return Status{};
-}
-
-void CLBatchConcatenateLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_3D();
-
- const int offset_to_first_elements_in_bytes = _batch_offset * _output->info()->strides_in_bytes()[3];
-
- unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
- _kernel.setArg<cl_int>(idx, offset_to_first_elements_in_bytes);
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
-}
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index 9db175d922..c88a852a44 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,56 +21,66 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
+#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
using namespace arm_compute;
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta, const ITensorInfo *gamma,
- float epsilon, ActivationLayerInfo act_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta,
+ const ITensorInfo *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_UNUSED(epsilon);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
- if(beta != nullptr)
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(
+ input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
+ if (beta != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
}
- if(gamma != nullptr)
+ if (gamma != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
}
- if(act_info.enabled())
+ if (act_info.enabled())
{
ActivationLayerInfo::ActivationFunction act = act_info.activation();
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32 && input->data_type() != DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU
- && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
- && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+ ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU &&
+ act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU &&
+ act !=
+ ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a());
}
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -80,23 +90,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
- ITensorInfo *mean, ITensorInfo *var, ITensorInfo *beta, ITensorInfo *gamma)
+std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *output)
{
- if(output != nullptr)
- {
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, *input->clone());
- }
-
- const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(16 / input->element_size(), input->dimension(0));
// Configure kernel window
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
bool window_changed = false;
- if(output != nullptr)
+ if (output != nullptr)
{
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
window_changed = update_window_and_padding(win, input_access, output_access);
@@ -107,54 +111,57 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
window_changed = update_window_and_padding(win, input_access);
}
- // Mean, var, gamma and beta get parallelized for the NHWC case as they follow the channel dimension, which is along the first axis
- if(input->data_layout() == DataLayout::NHWC)
- {
- AccessWindowHorizontal mean_access(mean, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal var_access(var, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, mean_access, var_access);
-
- if(beta != nullptr)
- {
- AccessWindowHorizontal beta_access(beta, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, beta_access);
- }
- if(gamma != nullptr)
- {
- AccessWindowHorizontal gamma_access(gamma, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, gamma_access);
- }
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel()
- : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0), _run_in_place(false)
+ : _input(nullptr),
+ _output(nullptr),
+ _mean(nullptr),
+ _var(nullptr),
+ _beta(nullptr),
+ _gamma(nullptr),
+ _epsilon(0),
+ _run_in_place(false)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma,
- float epsilon, ActivationLayerInfo act_info)
+void CLBatchNormalizationLayerKernel::configure(ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta,
+ const ICLTensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info);
}
-void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta,
- const ICLTensor *gamma,
- float epsilon, ActivationLayerInfo act_info)
+void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta,
+ const ICLTensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var);
- _input = input;
- _output = output;
- _mean = mean;
- _var = var;
- _beta = beta;
- _gamma = gamma;
- _epsilon = epsilon;
+ auto padding_info = get_padding_info({input, output, mean, var, beta, gamma});
+ _input = input;
+ _output = output;
+ _mean = mean;
+ _var = var;
+ _beta = beta;
+ _gamma = gamma;
+ _epsilon = epsilon;
_run_in_place = (output == nullptr) || (output == input);
@@ -162,12 +169,15 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_
mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr,
(gamma != nullptr) ? gamma->info() : nullptr, epsilon, act_info));
- const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+ unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
// Set build options
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
@@ -176,28 +186,45 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_
build_opts.add_option_if(gamma == nullptr, "-DUSE_DEFAULT_GAMMA");
// Create kernel
- _kernel = create_kernel(compile_context, "batchnormalization_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel =
+ create_kernel(compile_context,
+ "batchnormalization_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Set kernel static arguments
unsigned int include_output = (!_run_in_place) ? 1 : 0;
- unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor() + 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
- if(_beta != nullptr)
+ unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor() +
+ 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
+ if (_beta != nullptr)
{
idx += num_arguments_per_1D_tensor(); // Skip beta parameter
}
- if(_gamma != nullptr)
+ if (_gamma != nullptr)
{
idx += num_arguments_per_1D_tensor(); // Skip gamma parameter
}
_kernel.setArg<cl_float>(idx++, _epsilon);
+ if (output != nullptr)
+ {
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), *input->info()->clone());
+ }
+
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info(),
- mean->info(), var->info(),
- (beta != nullptr) ? beta->info() : nullptr,
- (gamma != nullptr) ? gamma->info() : nullptr);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
+ if (input->info()->data_layout() == DataLayout::NHWC)
+ {
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ ICLKernel::configure_internal(win);
+ }
+ else
+ {
+ auto win_config = validate_and_configure_window_nchw(input->info(), (_run_in_place) ? nullptr : output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+ }
+
+ ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info));
_config_id = "batch_normalization_layer_";
_config_id += string_from_data_type(input->info()->data_type());
@@ -211,18 +238,24 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_
_config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
}
-Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta, const ITensorInfo *gamma,
- float epsilon, ActivationLayerInfo act_info)
+Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta,
+ const ITensorInfo *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
const bool run_in_place = (output == nullptr) || (output == input);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(),
- mean->clone().get(), var->clone().get(),
- (beta != nullptr) ? beta->clone().get() : nullptr,
- (gamma != nullptr) ? gamma->clone().get() : nullptr)
- .first);
+
+ if (input->data_layout() != DataLayout::NHWC)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? nullptr : output->clone().get())
+ .first);
+ }
return Status{};
}
@@ -241,11 +274,11 @@ void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue
unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor();
add_1D_tensor_argument(idx, _mean, vector_slice);
add_1D_tensor_argument(idx, _var, vector_slice);
- if(_beta != nullptr)
+ if (_beta != nullptr)
{
add_1D_tensor_argument(idx, _beta, vector_slice);
}
- if(_gamma != nullptr)
+ if (_gamma != nullptr)
{
add_1D_tensor_argument(idx, _gamma, vector_slice);
}
@@ -254,11 +287,10 @@ void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue
{
idx = 0;
add_3D_tensor_argument(idx, _input, slice);
- if(!_run_in_place)
+ if (!_run_in_place)
{
add_3D_tensor_argument(idx, _output, slice);
}
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h
new file mode 100644
index 0000000000..1a88d2a8c5
--- /dev/null
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2017-2020, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H
+#define ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the BatchNormalization layer kernel.
+ */
+class CLBatchNormalizationLayerKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLBatchNormalizationLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLBatchNormalizationLayerKernel(const CLBatchNormalizationLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLBatchNormalizationLayerKernel &operator=(const CLBatchNormalizationLayerKernel &) = delete;
+ /** Default Move Constructor. */
+ CLBatchNormalizationLayerKernel(CLBatchNormalizationLayerKernel &&) = default;
+ /** Default move assignment operator */
+ CLBatchNormalizationLayerKernel &operator=(CLBatchNormalizationLayerKernel &&) = default;
+ /** Default destructor */
+ ~CLBatchNormalizationLayerKernel() = default;
+
+ /** Set the input and output tensors.
+ *
+ * @note If the output tensor is a nullptr, the batch normalization function will be performed in-place
+ *
+ * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result.
+ * 3 lower dimensions represent a single input with dimensions [width, height, FM].
+ * The rest are optional and used for representing batches. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
+ * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
+ * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] beta (Optional) Beta values tensor. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
+ * @param[in] gamma (Optional) Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
+ * @param[in] epsilon (Optional) Small value to avoid division by zero. Default value is 0.001f.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+ */
+ void configure(ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta = nullptr,
+ const ICLTensor *gamma = nullptr,
+ float epsilon = 0.001f,
+ ActivationLayerInfo act_info = ActivationLayerInfo());
+ /** Set the input and output tensors.
+ *
+ * @note If the output tensor is a nullptr, the batch normalization function will be performed in-place
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result.
+ * 3 lower dimensions represent a single input with dimensions [width, height, FM].
+ * The rest are optional and used for representing batches. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
+ * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
+ * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] beta (Optional) Beta values tensor. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
+ * @param[in] gamma (Optional) Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
+ * @param[in] epsilon (Optional) Small value to avoid division by zero. Default value is 0.001f.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+ */
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta = nullptr,
+ const ICLTensor *gamma = nullptr,
+ float epsilon = 0.001f,
+ ActivationLayerInfo act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref CLBatchNormalizationLayerKernel
+ *
+ * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result.
+ * 3 lower dimensions represent a single input with dimensions [width, height, FM].
+ * The rest are optional and used for representing batches. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
+ * @param[in] output Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input
+ * @param[in] mean Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] var Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] beta (Optional) Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
+ * @param[in] gamma (Optional) Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
+ * @param[in] epsilon (Optional) Small value to avoid division by zero. Default value is 0.001f.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta = nullptr,
+ const ITensorInfo *gamma = nullptr,
+ float epsilon = 0.001f,
+ ActivationLayerInfo act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_input;
+ ICLTensor *_output;
+ const ICLTensor *_mean;
+ const ICLTensor *_var;
+ const ICLTensor *_beta;
+ const ICLTensor *_gamma;
+ float _epsilon;
+ bool _run_in_place;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
index e899be9317..c640b5a8d6 100644
--- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,12 +21,17 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
+#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
using namespace arm_compute::misc::shape_calculator;
@@ -42,7 +47,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -50,7 +55,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
return Status{};
}
-Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const ITensorInfo *output)
+Status validate_arguments_static(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
@@ -62,14 +71,12 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape_x * input->tensor_shape()[idx_width]));
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape_y * input->tensor_shape()[idx_height]));
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] != input->tensor_shape()[idx_channel]);
+ const TensorShape expected_output_shape = compute_batch_to_space_shape(
+ input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info);
+ const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output);
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -78,9 +85,9 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
}
} // namespace
-CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel()
- : _input(nullptr), _block_shape(nullptr), _output(nullptr)
+CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel() : _input(nullptr), _block_shape(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
@@ -88,73 +95,98 @@ void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const ICLTenso
configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output);
}
-void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
+void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ auto padding_info = get_padding_info({input, block_shape, output});
+
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), output->info()));
_input = input;
_block_shape = block_shape;
_output = output;
- const int idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
-
// Create kernel
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(input->info()->dimension(3)));
- build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
- _kernel = create_kernel(compile_context, "batch_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
+ build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(3)));
+ _kernel = create_kernel(compile_context,
+ "batch_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
+ Window win = calculate_max_window(*output->info(), Steps());
ICLKernel::configure_internal(win);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output)
+void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output);
+ configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output, crop_info);
}
-void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output)
+void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = compute_batch_to_space_shape(input->info(), block_shape_x, block_shape_y);
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+ const TensorShape output_shape = compute_batch_to_space_shape(
+ input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y);
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info));
_input = input;
_output = output;
- const int idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
-
// Create kernel
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
- build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(input->info()->dimension(3)));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+ build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(3)));
build_opts.add_option("-DBLOCK_SHAPE_X=" + support::cpp11::to_string(block_shape_x));
build_opts.add_option("-DBLOCK_SHAPE_Y=" + support::cpp11::to_string(block_shape_y));
- build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
- _kernel = create_kernel(compile_context, "batch_to_space_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ build_opts.add_option("-DCROP_LEFT=" + support::cpp11::to_string(crop_info.left));
+ build_opts.add_option("-DCROP_TOP=" + support::cpp11::to_string(crop_info.top));
+ _kernel = create_kernel(
+ compile_context, "batch_to_space_static_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
+ Window win = calculate_max_window(*output->info(), Steps());
ICLKernel::configure_internal(win);
}
-Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+Status
+CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_shape, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, output));
return Status{};
}
-Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output)
+Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output, crop_info));
return Status{};
}
@@ -163,32 +195,31 @@ void CLBatchToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- Window slice_in = window.first_slice_window_3D();
- Window slice_out = window.first_slice_window_4D();
+ Window slice_out = window.first_slice_window_3D();
+ Window slice_in = window.first_slice_window_4D();
Window vector_slice = window.first_slice_window_1D();
vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
- slice_out.set(3, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
int batch_id = 0;
do
{
unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _input, slice_in);
add_argument(idx, batch_id);
- if(_block_shape != nullptr)
+ if (_block_shape != nullptr)
{
add_1D_tensor_argument(idx, _block_shape, vector_slice);
}
- add_4D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_in, lws_hint());
+ add_3D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out, lws_hint());
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_in));
+ } while (window.slide_window_slice_3D(slice_out));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h
new file mode 100644
index 0000000000..b9d3e66fe2
--- /dev/null
+++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLBATCHTOSPACELAYERKERNEL_H
+#define ARM_COMPUTE_CLBATCHTOSPACELAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the batch to space kernel */
+class CLBatchToSpaceLayerKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLBatchToSpaceLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLBatchToSpaceLayerKernel(const CLBatchToSpaceLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLBatchToSpaceLayerKernel &operator=(const CLBatchToSpaceLayerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLBatchToSpaceLayerKernel(CLBatchToSpaceLayerKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLBatchToSpaceLayerKernel &operator=(CLBatchToSpaceLayerKernel &&) = default;
+ /** Default destructor */
+ ~CLBatchToSpaceLayerKernel() = default;
+ /** Initialise the kernel's inputs and output.
+ *
+ * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
+ * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
+ * @param[out] output Tensor output. Data types supported: same as @p input
+ *
+ * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release
+ */
+ void configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output);
+ /** Initialise the kernel's inputs and output.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
+ * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
+ * @param[out] output Tensor output. Data types supported: same as @p input
+ *
+ * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ ICLTensor *output);
+ /** Initialise the kernel's inputs and output (Static block shape).
+ *
+ * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
+ * @param[in] block_shape_x Block shape x value.
+ * @param[in] block_shape_y Block shape y value.
+ * @param[out] output Tensor output. Data types supported: same as @p input
+ * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed
+ */
+ void configure(const ICLTensor *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info);
+ /** Initialise the kernel's inputs and output (Static block shape).
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
+ * @param[in] block_shape_x Block shape x value.
+ * @param[in] block_shape_y Block shape y value.
+ * @param[out] output Tensor output. Data types supported: same as @p input
+ * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayerKernel
+ *
+ * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
+ * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
+ * @param[in] output Tensor output. Data types supported: same as @p input
+ *
+ * @return a status
+ *
+ * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayerKernel (Static block shape).
+ *
+ * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
+ * @param[in] block_shape_x Block shape x value.
+ * @param[in] block_shape_y Block shape y value.
+ * @param[in] output Tensor output. Data types supported: same as @p input
+ * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ const ICLTensor *_block_shape; /**< Block shape tensor */
+ ICLTensor *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLBATCHTOSPACELAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLBitwiseAndKernel.cpp b/src/core/CL/kernels/CLBitwiseAndKernel.cpp
deleted file mode 100644
index 45622aa319..0000000000
--- a/src/core/CL/kernels/CLBitwiseAndKernel.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-using namespace arm_compute;
-
-CLBitwiseAndKernel::CLBitwiseAndKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-void CLBitwiseAndKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
-}
-
-void CLBitwiseAndKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
- _input1 = input1;
- _input2 = input2;
- _output = output;
-
- // Create kernel
- _kernel = create_kernel(compile_context, "bitwise_and");
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 16;
-
- Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, input1_access, input2_access, output_access);
-
- ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
- input2->info()->valid_region());
-
- output_access.set_valid_region(win, valid_region);
-
- ICLKernel::configure_internal(win);
-}
-
-void CLBitwiseAndKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input1, slice);
- add_2D_tensor_argument(idx, _input2, slice);
- add_2D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLBitwiseKernel.cpp b/src/core/CL/kernels/CLBitwiseKernel.cpp
new file mode 100644
index 0000000000..de3fb43de8
--- /dev/null
+++ b/src/core/CL/kernels/CLBitwiseKernel.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2020-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/CL/kernels/CLBitwiseKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+CLBitwiseKernel::CLBitwiseKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void CLBitwiseKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ BitwiseOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+ if (op != BitwiseOperation::NOT)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input2);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+ }
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*(output->info()), *(input1->info()));
+ auto padding_info = get_padding_info({input1, input2, output});
+
+ // Configure kernel window
+ const unsigned int vec_size_x = adjust_vec_size(16 / output->info()->element_size(), output->info()->dimension(0));
+ Window win = calculate_max_window(*output->info(), Steps(vec_size_x));
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // Create kernel
+ std::string kernel_name = "";
+ switch (op)
+ {
+ case BitwiseOperation::AND:
+ kernel_name = "bitwise_and";
+ break;
+ case BitwiseOperation::NOT:
+ kernel_name = "bitwise_not";
+ break;
+ case BitwiseOperation::OR:
+ kernel_name = "bitwise_or";
+ break;
+ case BitwiseOperation::XOR:
+ kernel_name = "bitwise_xor";
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Bitwise operation not supported");
+ }
+
+ CLBuildOptions build_opts;
+ const int vec_size_x_leftovers = output->info()->dimension(0) % vec_size_x;
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftovers));
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ ICLKernel::configure_internal(win);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+void CLBitwiseKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input1, slice);
+ if (_input2 != nullptr)
+ {
+ add_2D_tensor_argument(idx, _input2, slice);
+ }
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (window.slide_window_slice_2D(slice));
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLBitwiseKernel.h b/src/core/CL/kernels/CLBitwiseKernel.h
new file mode 100644
index 0000000000..2c74955ae4
--- /dev/null
+++ b/src/core/CL/kernels/CLBitwiseKernel.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLBITWISEKERNEL_H
+#define ARM_COMPUTE_CLBITWISEKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the bitwise operation kernel.
+ *
+ * Result depends on the \ref BitwiseOperation and is computed by:
+ * AND operation: @f[ output(x,y) = input1(x,y) \land input2(x,y) @f]
+ * NOT operation: @f[ output(x,y) = \lnot input1(x,y) @f]
+ * OR operation: @f[ output(x,y) = input1(x,y) \lor input2(x,y) @f]
+ * XOR operation: @f[ output(x,y) = input1(x,y) \oplus input2(x,y) @f]
+ */
+class CLBitwiseKernel : public ICLKernel
+{
+public:
+ /** Default constructor. */
+ CLBitwiseKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLBitwiseKernel(const CLBitwiseKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLBitwiseKernel &operator=(const CLBitwiseKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLBitwiseKernel(CLBitwiseKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLBitwiseKernel &operator=(CLBitwiseKernel &&) = default;
+ /** Set the inputs and output tensors
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input1 Source tensor. Data types supported: U8.
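+ * @param[in] input2 Source tensor. Data types supported: U8. Only null-checked when @p op is not NOT; pass nullptr for NOT, since a non-null tensor is always added as a kernel argument.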
+ * @param[out] output Destination tensor. Data types supported: U8.
+ * @param[in] op Bitwise operation to perform. Supported: AND, OR, NOT, XOR.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ BitwiseOperation op);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input1; /**< Source tensor 1 */
+ const ICLTensor *_input2; /**< Source tensor 2 */
+ ICLTensor *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLBITWISEKERNEL_H */
diff --git a/src/core/CL/kernels/CLBitwiseNotKernel.cpp b/src/core/CL/kernels/CLBitwiseNotKernel.cpp
deleted file mode 100644
index 0ad20a1897..0000000000
--- a/src/core/CL/kernels/CLBitwiseNotKernel.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-
-using namespace arm_compute;
-
-void CLBitwiseNotKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLBitwiseNotKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
- _input = input;
- _output = output;
-
- // Create kernel
- _kernel = create_kernel(compile_context, "bitwise_not");
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 16;
- ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
-}
diff --git a/src/core/CL/kernels/CLBitwiseOrKernel.cpp b/src/core/CL/kernels/CLBitwiseOrKernel.cpp
deleted file mode 100644
index a911dd9e01..0000000000
--- a/src/core/CL/kernels/CLBitwiseOrKernel.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-using namespace arm_compute;
-
-CLBitwiseOrKernel::CLBitwiseOrKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLBitwiseOrKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
-}
-
-void CLBitwiseOrKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
- _input1 = input1;
- _input2 = input2;
- _output = output;
-
- // Create kernel
- _kernel = create_kernel(compile_context, "bitwise_or");
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 16;
-
- Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, input1_access, input2_access, output_access);
-
- ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
- input2->info()->valid_region());
-
- output_access.set_valid_region(win, valid_region);
-
- ICLKernel::configure_internal(win);
-}
-
-void CLBitwiseOrKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input1, slice);
- add_2D_tensor_argument(idx, _input2, slice);
- add_2D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLBitwiseXorKernel.cpp b/src/core/CL/kernels/CLBitwiseXorKernel.cpp
deleted file mode 100644
index 084991a202..0000000000
--- a/src/core/CL/kernels/CLBitwiseXorKernel.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-using namespace arm_compute;
-
-CLBitwiseXorKernel::CLBitwiseXorKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLBitwiseXorKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
-}
-
-void CLBitwiseXorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
- _input1 = input1;
- _input2 = input2;
- _output = output;
-
- // Create kernel
- _kernel = create_kernel(compile_context, "bitwise_xor");
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 16;
-
- Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, input1_access, input2_access, output_access);
-
- ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
- input2->info()->valid_region());
-
- output_access.set_valid_region(win, valid_region);
-
- ICLKernel::configure_internal(win);
-}
-
-void CLBitwiseXorKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input1, slice);
- add_2D_tensor_argument(idx, _input2, slice);
- add_2D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
index 55c6f8453b..f32c518e29 100644
--- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
+++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,26 +21,30 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h"
+#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"
-#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLArray.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status validate_arguments(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(boxes);
@@ -53,7 +57,7 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
ARM_COMPUTE_RETURN_ERROR_ON(boxes->num_dimensions() > 2);
const bool is_qasymm16 = boxes->data_type() == DataType::QASYMM16;
- if(is_qasymm16)
+ if (is_qasymm16)
{
const UniformQuantizationInfo boxes_qinfo = boxes->quantization_info().uniform();
ARM_COMPUTE_RETURN_ERROR_ON(boxes_qinfo.scale != 0.125f);
@@ -65,12 +69,12 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes, deltas);
}
- if(pred_boxes->total_size() > 0)
+ if (pred_boxes->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(pred_boxes->tensor_shape(), deltas->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(pred_boxes, boxes);
ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes->num_dimensions() > 2);
- if(is_qasymm16)
+ if (is_qasymm16)
{
const UniformQuantizationInfo pred_boxes_qinfo = pred_boxes->quantization_info().uniform();
ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes_qinfo.scale != 0.125f);
@@ -83,20 +87,31 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
}
} // namespace
-CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel()
- : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr)
+CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel() : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLBoundingBoxTransformKernel::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+void CLBoundingBoxTransformKernel::configure(const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info)
{
configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info);
}
-void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
- auto_init_if_empty(*pred_boxes->info(), deltas->info()->clone()->set_data_type(boxes->info()->data_type()).set_quantization_info(boxes->info()->quantization_info()));
+ auto padding_info = get_padding_info({boxes, pred_boxes, deltas});
+ auto_init_if_empty(*pred_boxes->info(), deltas->info()
+ ->clone()
+ ->set_data_type(boxes->info()->data_type())
+ .set_quantization_info(boxes->info()->quantization_info()));
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(boxes->info(), pred_boxes->info(), deltas->info(), info));
@@ -126,7 +141,7 @@ void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_con
build_opts.add_option_if(info.apply_scale(), "-DSCALE_AFTER=" + float_to_string_with_full_precision(info.scale()));
build_opts.add_option_if(info.correct_transform_coords(), "-DOFFSET=1");
- if(is_quantized)
+ if (is_quantized)
{
build_opts.add_option("-DDATA_TYPE_DELTAS=" + get_cl_type_from_data_type(deltas->info()->data_type()));
const UniformQuantizationInfo boxes_qinfo = boxes->info()->quantization_info().uniform();
@@ -146,11 +161,15 @@ void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_con
// Since the number of columns is a multiple of 4 by definition, we don't need to pad the tensor
const unsigned int num_elems_processed_per_iteration = 4;
- Window win = calculate_max_window(*deltas->info(), Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*deltas->info(), Steps(num_elems_processed_per_iteration));
ICLKernel::configure_internal(win);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status CLBoundingBoxTransformKernel::validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(boxes, pred_boxes, deltas, info));
return Status{};
diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.h b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h
new file mode 100644
index 0000000000..9a1bb49bb9
--- /dev/null
+++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLBOUNDINGBOXTRANSFORMKERNEL_H
+#define ARM_COMPUTE_CLBOUNDINGBOXTRANSFORMKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the bounding box transform kernel (applies translation/scale deltas to box proposals) */
+class CLBoundingBoxTransformKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLBoundingBoxTransformKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLBoundingBoxTransformKernel(const CLBoundingBoxTransformKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLBoundingBoxTransformKernel &operator=(const CLBoundingBoxTransformKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLBoundingBoxTransformKernel(CLBoundingBoxTransformKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLBoundingBoxTransformKernel &operator=(CLBoundingBoxTransformKernel &&) = default;
+ /** Default destructor */
+ ~CLBoundingBoxTransformKernel() = default;
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] boxes Source tensor. Bounding box proposals in pixel coordinates. Size (M, 4), format [x1, y1, x2, y2]. Data types supported: QASYMM16/F16/F32.
+ * @param[out] pred_boxes Destination tensor. Pixel coordinates of the transformed bounding boxes. Size (M, 4*K), format [x1, y1, x2, y2]. Data types supported: Same as @p boxes
+ * @param[in] deltas Bounding box translations and scales. Size (M, 4*K), format [dx, dy, dw, dh], K is the number of classes.
+ * Data types supported: QASYMM8 if @p boxes is QASYMM16, otherwise same as @p boxes
+ * @param[in] info Contains BoundingBox operation information described in @ref BoundingBoxTransformInfo.
+ *
+ * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
+ *
+ */
+ void configure(const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info);
+ /** Set the input and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] boxes Source tensor. Bounding box proposals in pixel coordinates. Size (M, 4), format [x1, y1, x2, y2]. Data types supported: QASYMM16/F16/F32.
+ * @param[out] pred_boxes Destination tensor. Pixel coordinates of the transformed bounding boxes. Size (M, 4*K), format [x1, y1, x2, y2]. Data types supported: Same as @p boxes
+ * @param[in] deltas Bounding box translations and scales. Size (M, 4*K), format [dx, dy, dw, dh], K is the number of classes.
+ * Data types supported: QASYMM8 if @p boxes is QASYMM16, otherwise same as @p boxes
+ * @param[in] info Contains BoundingBox operation information described in @ref BoundingBoxTransformInfo.
+ *
+ * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
+ *
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransformKernel
+ *
+ * @param[in] boxes Source tensor info. Bounding box proposals in pixel coordinates. Size (M, 4), format [x1, y1, x2, y2]. Data types supported: QASYMM16/F16/F32.
+ * @param[in] pred_boxes Destination tensor info. Pixel coordinates of the transformed bounding boxes. Size (M, 4*K), format [x1, y1, x2, y2]. Data types supported: Same as @p boxes. Only checked when its total size is non-zero.
+ * @param[in] deltas Bounding box translations and scales. Size (M, 4*K), format [dx, dy, dw, dh], K is the number of classes.
+ * Data types supported: QASYMM8 if @p boxes is QASYMM16, otherwise same as @p boxes
+ * @param[in] info Contains BoundingBox operation information described in @ref BoundingBoxTransformInfo.
+ *
+ * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
+ *
+ * @return a Status
+ */
+ static Status validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_boxes;
+ ICLTensor *_pred_boxes;
+ const ICLTensor *_deltas;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLBOUNDINGBOXTRANSFORMKERNEL_H */
diff --git a/src/core/CL/kernels/CLBox3x3Kernel.cpp b/src/core/CL/kernels/CLBox3x3Kernel.cpp
deleted file mode 100644
index d665845ed7..0000000000
--- a/src/core/CL/kernels/CLBox3x3Kernel.cpp
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-BorderSize CLBox3x3Kernel::border_size() const
-{
- return BorderSize(1);
-}
-
-void CLBox3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined);
-}
-
-void CLBox3x3Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
- _input = input;
- _output = output;
-
- // Set build options
- std::set<std::string> build_opts = { "-DMAT0=1", "-DMAT1=1", "-DMAT2=1",
- "-DMAT3=1", "-DMAT4=1", "-DMAT5=1",
- "-DMAT6=1", "-DMAT7=1", "-DMAT8=1",
- "-DSCALE=9", "-DDATA_TYPE_OUT=uchar"
- };
-
- // Create kernel
- _kernel = create_kernel(compile_context, "convolution3x3_static", build_opts);
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 8;
- constexpr unsigned int num_elems_read_per_iteration = 16;
- constexpr unsigned int num_elems_written_per_iteration = 8;
- constexpr unsigned int num_rows_read_per_iteration = 3;
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
- AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
- ICLKernel::configure_internal(win);
-}
diff --git a/src/core/CL/kernels/CLCannyEdgeKernel.cpp b/src/core/CL/kernels/CLCannyEdgeKernel.cpp
deleted file mode 100644
index 95b03970b0..0000000000
--- a/src/core/CL/kernels/CLCannyEdgeKernel.cpp
+++ /dev/null
@@ -1,309 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLCannyEdgeKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-CLGradientKernel::CLGradientKernel()
- : _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
-{
-}
-
-void CLGradientKernel::configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type)
-{
- configure(CLKernelLibrary::get().get_compile_context(), gx, gy, magnitude, phase, norm_type);
-}
-
-void CLGradientKernel::configure(const CLCompileContext &compile_context, const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_MSG(data_size_from_type(gx->info()->data_type()) != data_size_from_type(gy->info()->data_type()),
- "Gx and Gy must have the same pixel size");
- ARM_COMPUTE_ERROR_ON_MSG(data_size_from_type(gx->info()->data_type()) != data_size_from_type(magnitude->info()->data_type()),
- "Mag must have the same pixel size as Gx and Gy");
-
- _gx = gx;
- _gy = gy;
- _magnitude = magnitude;
- _phase = phase;
-
- // Create build opts
- std::set<std::string> built_opts;
- built_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(gx->info()->data_type()));
- built_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(gx->info()->data_type()));
-
- // Create kernel
- const std::string kernel_name = (norm_type == 1) ? std::string("combine_gradients_L1") : std::string("combine_gradients_L2");
- _kernel = create_kernel(compile_context, kernel_name, built_opts);
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 4;
-
- Window win = calculate_max_window(*_gx->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal gx_access(_gx->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal gy_access(_gy->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal mag_access(_magnitude->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, gx_access, gy_access, mag_access, phase_access);
-
- mag_access.set_valid_region(win, _gx->info()->valid_region());
- phase_access.set_valid_region(win, _gx->info()->valid_region());
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(gx->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(gx->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(gx->info()->dimension(1));
-}
-
-void CLGradientKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _gx, slice);
- add_2D_tensor_argument(idx, _gy, slice);
- add_2D_tensor_argument(idx, _magnitude, slice);
- add_2D_tensor_argument(idx, _phase, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
-
-CLEdgeNonMaxSuppressionKernel::CLEdgeNonMaxSuppressionKernel()
- : _magnitude(nullptr), _phase(nullptr), _output(nullptr)
-{
-}
-
-BorderSize CLEdgeNonMaxSuppressionKernel::border_size() const
-{
- return BorderSize(1);
-}
-
-void CLEdgeNonMaxSuppressionKernel::configure(const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined)
-{
- configure(CLKernelLibrary::get().get_compile_context(), magnitude, phase, output, lower_thr, border_undefined);
-}
-
-void CLEdgeNonMaxSuppressionKernel::configure(const CLCompileContext &compile_context, const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::U32);
-
- _magnitude = magnitude;
- _phase = phase;
- _output = output;
-
- // Create build opts
- std::set<std::string> built_opts;
- built_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(magnitude->info()->data_type()));
- built_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-
- // Create kernel
- const std::string kernel_name = std::string("suppress_non_maximum");
- _kernel = create_kernel(compile_context, kernel_name, built_opts);
-
- // Set minimum threshold argument
- unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
- _kernel.setArg(idx++, lower_thr);
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 1;
- constexpr unsigned int num_elems_read_written_per_iteration = 3;
-
- Window win = calculate_max_window(*_magnitude->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
- AccessWindowRectangle mag_access(_magnitude->info(), -border_size().left, -border_size().top,
- num_elems_read_written_per_iteration, num_elems_read_written_per_iteration);
- AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, mag_access, phase_access, output_access);
-
- output_access.set_valid_region(win, _magnitude->info()->valid_region(), border_undefined, border_size());
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(output->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(border_undefined);
-}
-
-void CLEdgeNonMaxSuppressionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _magnitude, slice);
- add_2D_tensor_argument(idx, _phase, slice);
- add_2D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
-
-CLEdgeTraceKernel::CLEdgeTraceKernel()
- : _input(nullptr), _output(nullptr), _lower_thr(0), _upper_thr(0), _visited(nullptr), _recorded(nullptr), _l1_stack(nullptr), _l1_stack_counter(nullptr)
-{
-}
-
-void CLEdgeTraceKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr,
- ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, upper_thr, lower_thr, visited, recorded, l1_stack, l1_stack_counter);
-}
-
-void CLEdgeTraceKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr,
- ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::U32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(visited, 1, DataType::U32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(recorded, 1, DataType::U32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(l1_stack, 1, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(l1_stack_counter, 1, DataType::U8);
-
- _input = input;
- _output = output;
- _lower_thr = lower_thr;
- _upper_thr = upper_thr;
- _visited = visited;
- _recorded = recorded;
- _l1_stack = l1_stack;
- _l1_stack_counter = l1_stack_counter;
-
- // Create build opts
- std::set<std::string> built_opts;
- built_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
- built_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-
- // Create kernel
- const std::string kernel_name = std::string("hysteresis");
- _kernel = create_kernel(compile_context, kernel_name, built_opts);
-
- // Set constant kernel args
- unsigned int width = _input->info()->dimension(0);
- unsigned int height = _input->info()->dimension(1);
- unsigned int idx = 6 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
- _kernel.setArg(idx++, static_cast<cl_uint>(_lower_thr));
- _kernel.setArg(idx++, static_cast<cl_uint>(_upper_thr));
- _kernel.setArg(idx++, static_cast<cl_uint>(width));
- _kernel.setArg(idx++, static_cast<cl_uint>(height));
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 1;
- Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal visited_access(_visited->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal recorded_access(_recorded->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal l1_stack_access(_l1_stack->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal l1_stack_counter_access(_l1_stack_counter->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win,
- AccessWindowHorizontal(_input->info(), 0, num_elems_processed_per_iteration),
- output_access,
- visited_access,
- recorded_access,
- l1_stack_access,
- l1_stack_counter_access);
-
- output_access.set_valid_region(win, _input->info()->valid_region());
- visited_access.set_valid_region(win, _input->info()->valid_region());
- recorded_access.set_valid_region(win, _input->info()->valid_region());
- l1_stack_access.set_valid_region(win, _input->info()->valid_region());
- l1_stack_counter_access.set_valid_region(win, _input->info()->valid_region());
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += lower_string(string_from_format(output->info()->format()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-void CLEdgeTraceKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
- add_2D_tensor_argument(idx, _output, slice);
- add_2D_tensor_argument(idx, _visited, slice);
- add_2D_tensor_argument(idx, _recorded, slice);
- add_2D_tensor_argument(idx, _l1_stack, slice);
- add_2D_tensor_argument(idx, _l1_stack_counter, slice);
-
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLChannelCombineKernel.cpp b/src/core/CL/kernels/CLChannelCombineKernel.cpp
deleted file mode 100644
index 017d98f860..0000000000
--- a/src/core/CL/kernels/CLChannelCombineKernel.cpp
+++ /dev/null
@@ -1,298 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLMultiImage.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/MultiImageInfo.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <set>
-#include <string>
-
-namespace arm_compute
-{
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-} // namespace
-
-CLChannelCombineKernel::CLChannelCombineKernel()
- : _planes{ { nullptr } }, _output(nullptr), _output_multi(nullptr), _x_subsampling{ { 1, 1, 1 } }, _y_subsampling{ { 1, 1, 1 } }
-{
-}
-
-void CLChannelCombineKernel::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), plane0, plane1, plane2, plane3, output);
-}
-
-void CLChannelCombineKernel::configure(const CLCompileContext &compile_context, const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output);
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0);
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1);
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2);
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
-
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::RGB888, Format::RGBA8888, Format::YUYV422, Format::UYVY422);
-
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
-
- const Format output_format = output->info()->format();
-
- // Check if horizontal dimension of Y plane is even and validate horizontal sub-sampling dimensions for U and V planes
- if(Format::YUYV422 == output_format || Format::UYVY422 == output_format)
- {
- // Validate Y plane of input and output
- ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(output_format, plane0, output);
-
- // Validate U and V plane of the input
- ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), plane1, plane2);
- }
-
- _planes[0] = plane0;
- _planes[1] = plane1;
- _planes[2] = plane2;
- _planes[3] = nullptr;
-
- // Validate the last input tensor only for RGBA format
- if(Format::RGBA8888 == output_format)
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR(plane3);
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane3);
-
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane3, Format::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane3, 1, DataType::U8);
-
- _planes[3] = plane3;
- }
-
- _output = output;
- _output_multi = nullptr;
-
- // Half the processed elements for U and V channels due to horizontal sub-sampling of 2
- if(Format::YUYV422 == output_format || Format::UYVY422 == output_format)
- {
- _x_subsampling[1] = 2;
- _x_subsampling[2] = 2;
- }
-
- // Create kernel
- std::string kernel_name = "channel_combine_" + string_from_format(output_format);
- _kernel = create_kernel(compile_context, kernel_name);
-
- // Configure window
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal plane0_access(plane0->info(), 0, num_elems_processed_per_iteration);
- AccessWindowRectangle plane1_access(plane1->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
- AccessWindowRectangle plane2_access(plane2->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
- AccessWindowHorizontal plane3_access(plane3 == nullptr ? nullptr : plane3->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, plane0_access, plane1_access, plane2_access, plane3_access, output_access);
-
- ValidRegion valid_region = intersect_valid_regions(plane0->info()->valid_region(),
- plane1->info()->valid_region(),
- plane2->info()->valid_region());
- if(plane3 != nullptr)
- {
- valid_region = intersect_valid_regions(plane3->info()->valid_region(), valid_region);
- }
- output_access.set_valid_region(win, ValidRegion(valid_region.anchor, output->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-}
-
-void CLChannelCombineKernel::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), plane0, plane1, plane2, output);
-}
-
-void CLChannelCombineKernel::configure(const CLCompileContext &compile_context, const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output);
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0);
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1);
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2);
-
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
-
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
-
- const Format output_format = output->info()->format();
-
- // Validate shape of Y plane to be even and shape of sub-sampling dimensions for U and V planes
- // Perform validation only for formats which require sub-sampling.
- if(Format::YUV444 != output_format)
- {
- // Validate Y plane of input and output
- ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(output_format, plane0, output->plane(0));
-
- // Validate U and V plane of the input
- ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), plane1, plane2);
-
- // Validate second plane U (NV12 and NV21 have a UV88 combined plane while IYUV has only the U plane)
- // MultiImage generates the correct tensor shape but also check in case the tensor shape of planes was changed to a wrong size
- ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), output->plane(1));
-
- // Validate the last plane V of format IYUV
- if(Format::IYUV == output_format)
- {
- // Validate Y plane of the output
- ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), output->plane(2));
- }
- }
-
- // Set input tensors
- _planes[0] = plane0;
- _planes[1] = plane1;
- _planes[2] = plane2;
- _planes[3] = nullptr;
-
- // Set output tensor
- _output = nullptr;
- _output_multi = output;
-
- bool has_two_planars = false;
-
- // Set sub-sampling parameters for each plane
- std::string kernel_name;
- std::set<std::string> build_opts;
-
- if(Format::NV12 == output_format || Format::NV21 == output_format)
- {
- _x_subsampling = { { 1, 2, 2 } };
- _y_subsampling = { { 1, 2, 2 } };
- kernel_name = "channel_combine_NV";
- build_opts.emplace(Format::NV12 == output_format ? "-DNV12" : "-DNV21");
- has_two_planars = true;
- }
- else
- {
- if(Format::IYUV == output_format)
- {
- _x_subsampling = { { 1, 2, 2 } };
- _y_subsampling = { { 1, 2, 2 } };
- }
-
- kernel_name = "copy_planes_3p";
- build_opts.emplace(Format::IYUV == output_format ? "-DIYUV" : "-DYUV444");
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
- // Configure window
- Window win = calculate_max_window(*plane0->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowRectangle input_plane0_access(plane0->info(), 0, 0, num_elems_processed_per_iteration, 1.f);
- AccessWindowRectangle input_plane1_access(plane1->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
- AccessWindowRectangle input_plane2_access(plane2->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
- AccessWindowRectangle output_plane0_access(output->plane(0)->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f, 1.f / _y_subsampling[1]);
- AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
- AccessWindowRectangle output_plane2_access(has_two_planars ? nullptr : output->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
-
- update_window_and_padding(win,
- input_plane0_access, input_plane1_access, input_plane2_access,
- output_plane0_access, output_plane1_access, output_plane2_access);
-
- ValidRegion plane0_valid_region = plane0->info()->valid_region();
- ValidRegion output_plane1_region = has_two_planars ? intersect_valid_regions(plane1->info()->valid_region(), plane2->info()->valid_region()) : plane2->info()->valid_region();
- output_plane0_access.set_valid_region(win, ValidRegion(plane0_valid_region.anchor, output->plane(0)->info()->tensor_shape()));
- output_plane1_access.set_valid_region(win, ValidRegion(output_plane1_region.anchor, output->plane(1)->info()->tensor_shape()));
- output_plane2_access.set_valid_region(win, ValidRegion(plane2->info()->valid_region().anchor, output->plane(2)->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-}
-
-void CLChannelCombineKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
- slice.set_dimension_step(Window::DimY, 1);
-
- do
- {
- // Subsampling in plane 1
- Window win_sub_plane1(slice);
- win_sub_plane1.set(Window::DimX, Window::Dimension(win_sub_plane1.x().start() / _x_subsampling[1], win_sub_plane1.x().end() / _x_subsampling[1], win_sub_plane1.x().step() / _x_subsampling[1]));
- win_sub_plane1.set(Window::DimY, Window::Dimension(win_sub_plane1.y().start() / _y_subsampling[1], win_sub_plane1.y().end() / _y_subsampling[1], 1));
-
- // Subsampling in plane 2
- Window win_sub_plane2(slice);
- win_sub_plane2.set(Window::DimX, Window::Dimension(win_sub_plane2.x().start() / _x_subsampling[2], win_sub_plane2.x().end() / _x_subsampling[2], win_sub_plane2.x().step() / _x_subsampling[2]));
- win_sub_plane2.set(Window::DimY, Window::Dimension(win_sub_plane2.y().start() / _y_subsampling[2], win_sub_plane2.y().end() / _y_subsampling[2], 1));
-
- unsigned int idx = 0;
-
- // Set inputs
- add_2D_tensor_argument(idx, _planes[0], slice);
- add_2D_tensor_argument(idx, _planes[1], win_sub_plane1);
- add_2D_tensor_argument(idx, _planes[2], win_sub_plane2);
- add_2D_tensor_argument_if((nullptr != _planes[3]), idx, _planes[3], slice);
-
- // Set outputs
- if(nullptr != _output) // Single planar output
- {
- add_2D_tensor_argument(idx, _output, slice);
- }
- else // Multi-planar output
- {
- // Reduce slice in case of subsampling to avoid out-of bounds access
- slice.set(Window::DimY, Window::Dimension(slice.y().start() / _y_subsampling[1], slice.y().end() / _y_subsampling[1], 1));
-
- add_2D_tensor_argument(idx, _output_multi->cl_plane(0), slice);
- add_2D_tensor_argument(idx, _output_multi->cl_plane(1), win_sub_plane1);
- add_2D_tensor_argument_if((3 == num_planes_from_format(_output_multi->info()->format())), idx, _output_multi->cl_plane(2), win_sub_plane2);
-
- _kernel.setArg(idx++, slice.y().end());
- }
-
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLChannelExtractKernel.cpp b/src/core/CL/kernels/CLChannelExtractKernel.cpp
deleted file mode 100644
index 669d6c52ad..0000000000
--- a/src/core/CL/kernels/CLChannelExtractKernel.cpp
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLMultiImage.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/MultiImageInfo.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLChannelExtractKernel::CLChannelExtractKernel()
- : _input(nullptr), _output(nullptr), _num_elems_processed_per_iteration(8), _subsampling(1)
-{
-}
-
-void CLChannelExtractKernel::configure(const ICLTensor *input, Channel channel, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, channel, output);
-}
-
-void CLChannelExtractKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, Channel channel, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_ON(input == output);
-
- set_format_if_unknown(*output->info(), Format::U8);
-
- // Check if input tensor has a valid format
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::RGB888, Format::RGBA8888, Format::YUYV422, Format::UYVY422);
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
-
- // Check if channel is valid for given format
- const Format format = input->info()->format();
- ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(format, channel);
-
- // Half the processed elements for U,V channels due to sub-sampling of 2
- _subsampling = 1;
-
- if(format == Format::YUYV422 || format == Format::UYVY422)
- {
- // Check if the width of the tensor shape is even for formats with subsampled channels (UYVY422 and YUYV422)
- ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(format, input);
-
- if(channel != Channel::Y)
- {
- _subsampling = 2;
- }
- }
-
- // Calculate output tensor shape using subsampling
- TensorShape output_shape = calculate_subsampled_shape(input->info()->tensor_shape(), format, channel);
- set_shape_if_empty(*output->info(), output_shape);
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-
- _input = input;
- _output = output;
-
- // Create kernel
- std::string kernel_name = "channel_extract_" + string_from_format(format);
- std::set<std::string> build_opts = { ("-DCHANNEL_" + string_from_channel(channel)) };
- _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
- // Configure window
- Window win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input->info(), 0, _num_elems_processed_per_iteration);
- AccessWindowRectangle output_access(output->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _subsampling, 1.f / _subsampling);
-
- update_window_and_padding(win, input_access, output_access);
-
- ValidRegion input_valid_region = input->info()->valid_region();
- output_access.set_valid_region(win, ValidRegion(input_valid_region.anchor, output->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-}
-
-void CLChannelExtractKernel::configure(const ICLMultiImage *input, Channel channel, ICLImage *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, channel, output);
-}
-
-void CLChannelExtractKernel::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, Channel channel, ICLImage *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
-
- set_format_if_unknown(*output->info(), Format::U8);
-
- // Check if channel is valid for given format
- const Format format = input->info()->format();
- ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(format, channel);
-
- // Get input plane from the given channel
- const ICLImage *input_plane = input->cl_plane(plane_idx_from_channel(format, channel));
- ARM_COMPUTE_ERROR_ON_NULLPTR(input_plane);
-
- if(Channel::Y == channel && format != Format::YUV444)
- {
- // Check if the width of the tensor shape is even for formats with subsampled channels (UYVY422 and YUYV422)
- ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(format, input_plane);
- }
-
- // Calculate 2x2 subsampled tensor shape
- TensorShape output_shape = calculate_subsampled_shape(input->cl_plane(0)->info()->tensor_shape(), format, channel);
- set_shape_if_empty(*output->info(), output_shape);
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output_shape, output->info()->tensor_shape());
-
- // Check if input tensor has a valid format
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
-
- _output = output;
- _input = input_plane;
- _subsampling = 1;
-
- // Create kernel
- std::string kernel_name;
- std::set<std::string> build_opts;
- if(Channel::Y == channel || Format::IYUV == format || Format::YUV444 == format)
- {
- kernel_name = "copy_plane";
- }
- else
- {
- kernel_name = "channel_extract_" + string_from_format(format);
- build_opts.insert(("-DCHANNEL_" + string_from_channel(channel)));
- }
- _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
- // Configure window
- Window win = calculate_max_window(*input_plane->info(), Steps(_num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input_plane->info(), 0, _num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, input_plane->info()->valid_region());
-
- ICLKernel::configure_internal(win);
-}
-
-void CLChannelExtractKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
-
- do
- {
- Window win_sub(slice);
- win_sub.set(Window::DimX, Window::Dimension(win_sub.x().start() / _subsampling, win_sub.x().end() / _subsampling, win_sub.x().step() / _subsampling));
- win_sub.set(Window::DimY, Window::Dimension(win_sub.y().start() / _subsampling, win_sub.y().end() / _subsampling, 1));
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
- add_2D_tensor_argument(idx, _output, win_sub);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
index d4eafec4c4..ec58bf9e7a 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,16 +21,20 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h"
+#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Window.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
@@ -43,15 +47,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with less than 2 groups would be inefficient");
- const unsigned int channels = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
+ const unsigned int channels =
+ input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups == channels, "Channel shuffling with same number of groups as number of channels would be inefficient");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ num_groups == channels,
+ "Channel shuffling with same number of groups as number of channels would be inefficient");
// There cannot be more groups than channels
ARM_COMPUTE_RETURN_ERROR_ON(num_groups > channels);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, "The number of channels must be a multiple of the number of groups");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0,
+ "The number of channels must be a multiple of the number of groups");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -66,28 +74,42 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output, *input->clone());
- const bool is_nhwc = input->data_layout() == DataLayout::NHWC;
- const unsigned int num_elems_processed_per_iteration_x = is_nhwc ? 4 : max_cl_vector_width / input->element_size();
- constexpr unsigned int num_elems_processed_per_iteration_y = 2;
+ const bool is_nhwc = input->data_layout() == DataLayout::NHWC;
+ if (is_nhwc)
+ {
+ unsigned int num_elems_processed_per_iteration_x =
+ adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x));
+ Window win_collapsed = win.collapse(win, Window::DimZ);
+ return std::make_pair(Status{}, win_collapsed);
+ }
+ else
+ {
+ const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / input->element_size();
+ constexpr unsigned int num_elems_processed_per_iteration_y = 2;
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
- AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ // Configure kernel window
+ Window win = calculate_max_window(
+ *input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x,
+ num_elems_processed_per_iteration_y);
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x,
+ num_elems_processed_per_iteration_y);
- const bool window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, input->valid_region());
+ const bool window_changed = update_window_and_padding(win, input_access, output_access);
- Window win_collapsed = win.collapse(win, Window::DimZ);
+ Window win_collapsed = win.collapse(win, Window::DimZ);
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win_collapsed);
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win_collapsed);
+ }
}
} // namespace
-CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel()
- : _input(nullptr), _output(nullptr)
+CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel() : _input(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
@@ -95,27 +117,42 @@ void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *o
configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups);
}
-void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
+void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups));
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups));
-
const DataLayout data_layout = input->info()->data_layout();
const bool is_nhwc = data_layout == DataLayout::NHWC;
- const unsigned int channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
- const unsigned int vec_size = is_nhwc ? 4 : max_cl_vector_width / input->info()->element_size();
+ const unsigned int channels =
+ input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
+ unsigned int vec_size_x = 0;
+ unsigned int vec_size_x_leftovers = 0;
+ if (is_nhwc)
+ {
+ vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
+ vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
+ }
+ else
+ {
+ vec_size_x = max_cl_vector_width / input->info()->element_size();
+ }
// Set kernel build options
CLBuildOptions build_opts;
build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
build_opts.add_option("-DK=" + support::cpp11::to_string(channels / num_groups));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option_if(is_nhwc, "-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftovers));
+ build_opts.add_option_if(is_nhwc, "-DSRC_DIM_X=" + support::cpp11::to_string(input->info()->dimension(0)));
build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
- build_opts.add_option("-DLAST_ACCESSED=" + support::cpp11::to_string(std::max(static_cast<int>(channels - vec_size), 0)));
build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
// Create kernel
@@ -146,9 +183,14 @@ void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_cont
_config_id += support::cpp11::to_string(output->info()->dimension(1));
_config_id += "_";
_config_id += support::cpp11::to_string(output->info()->dimension(2));
+ if (data_layout == DataLayout::NHWC)
+ {
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+ }
}
-Status CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+Status
+CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, num_groups));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.h b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h
new file mode 100644
index 0000000000..43c939ebd8
--- /dev/null
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLCHANNELSHUFFLELAYERKERNEL_H
+#define ARM_COMPUTE_CLCHANNELSHUFFLELAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel interface for the channel shuffle layer (an @ref ICLKernel) */
+class CLChannelShuffleLayerKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLChannelShuffleLayerKernel();
+ /** Copy construction is disabled because this class stores raw tensor pointers */
+ CLChannelShuffleLayerKernel(const CLChannelShuffleLayerKernel &) = delete;
+ /** Copy assignment is disabled because this class stores raw tensor pointers */
+ CLChannelShuffleLayerKernel &operator=(const CLChannelShuffleLayerKernel &) = delete;
+ /** Allow instances of this class to be move-constructed */
+ CLChannelShuffleLayerKernel(CLChannelShuffleLayerKernel &&) = default;
+ /** Allow instances of this class to be move-assigned */
+ CLChannelShuffleLayerKernel &operator=(CLChannelShuffleLayerKernel &&) = default;
+ /** Default destructor (the stored tensor pointers are not released by this class) */
+ ~CLChannelShuffleLayerKernel() = default;
+ /** Configure the kernel's input tensor, output tensor and number of groups (overload without an explicit compile context).
+ *
+ * @param[in] input Input tensor. Data types supported: All.
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ * @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, unsigned int num_groups);
+ /** Configure the kernel's input tensor, output tensor and number of groups with an explicit compile context.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Input tensor. Data types supported: All.
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ * @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int num_groups);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLChannelShuffleLayerKernel
+ *
+ * @param[in] input Input tensor info. Data types supported: All.
+ * @param[in] output Output tensor info. Data type supported: Same as @p input
+ * @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups);
+
+ // Inherited methods overridden (from ICLKernel):
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Input tensor (raw pointer; presumably set by configure() - confirm in the .cpp) */
+ ICLTensor *_output; /**< Output tensor (raw pointer; presumably set by configure() - confirm in the .cpp) */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLCHANNELSHUFFLELAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
deleted file mode 100644
index 5adb9ef60d..0000000000
--- a/src/core/CL/kernels/CLCol2ImKernel.cpp
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/StringSupport.h"
-
-#include <cmath>
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_col2im_shape(*input, convolved_dims, true, num_groups));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_layout() != DataLayout::NCHW, "Col2Im output's data layout must always be NCHW");
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_col2im_shape(*input, convolved_dims, true, num_groups)).set_data_layout(DataLayout::NCHW));
-
- constexpr unsigned int num_elems_read_per_iteration = 8;
-
- // Configure window
- Window win = calculate_max_window(*input, Steps(num_elems_read_per_iteration));
-
- // Update window and padding just for the input tensor as we cannot access out-of-bounds elements in the output one
- AccessWindowHorizontal input_access(input, 0, num_elems_read_per_iteration);
- bool window_changed = update_window_and_padding(win, input_access);
-
- Coordinates coord;
- coord.set_num_dimensions(output->num_dimensions());
- output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLCol2ImKernel::CLCol2ImKernel()
- : _input(nullptr), _output(nullptr), _convolved_dims()
-{
-}
-
-void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, convolved_dims, num_groups);
-}
-
-void CLCol2ImKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), convolved_dims, num_groups));
-
- _input = input;
- _output = output;
- _convolved_dims = convolved_dims;
-
- const DataType data_type = input->info()->data_type();
-
- // Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
- build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size()));
- build_opts.add_option("-DWIDTH_INPUT=" + support::cpp11::to_string(input->info()->dimension(0)));
- build_opts.add_option("-DWIDTH_OUTPUT=" + support::cpp11::to_string(_convolved_dims.width));
- build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
-
- _kernel = create_kernel(compile_context, "col2im", build_opts.options());
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), _convolved_dims, num_groups);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Set config_id for enabling LWS tuning
- _config_id = "col2im_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(num_groups);
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-Status CLCol2ImKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, convolved_dims, num_groups));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), convolved_dims, num_groups).first);
- return Status{};
-}
-
-void CLCol2ImKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- bool is_collapsed = false;
- bool is_collapsed_out = false;
-
- Window out_window;
- out_window.use_tensor_dimensions(_output->info()->tensor_shape());
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &is_collapsed);
- Window collapsed_out = out_window.collapse_if_possible(out_window, 3, &is_collapsed_out);
-
- ARM_COMPUTE_ERROR_ON(is_collapsed != is_collapsed_out);
-
- Window slice = collapsed.first_slice_window_3D();
- Window slice_out = collapsed_out.first_slice_window_4D();
- do
- {
- // Set inputs
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_4D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLColorConvertKernel.cpp b/src/core/CL/kernels/CLColorConvertKernel.cpp
deleted file mode 100644
index e9612f3ea6..0000000000
--- a/src/core/CL/kernels/CLColorConvertKernel.cpp
+++ /dev/null
@@ -1,560 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLMultiImage.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/MultiImageInfo.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/StringSupport.h"
-
-#include <sstream>
-
-using namespace arm_compute;
-
-CLColorConvertKernel::CLColorConvertKernel()
- : _input(nullptr), _output(nullptr), _multi_input(nullptr), _multi_output(nullptr)
-{
-}
-
-void CLColorConvertKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLColorConvertKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON(input == nullptr);
- ARM_COMPUTE_ERROR_ON(output == nullptr);
-
- unsigned int num_elems_processed_per_iteration = 0;
- switch(input->info()->format())
- {
- case Format::RGBA8888:
- {
- switch(output->info()->format())
- {
- case Format::RGB888:
- num_elems_processed_per_iteration = 16;
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- break;
- }
- break;
- }
- case Format::UYVY422:
- case Format::YUYV422:
- {
- switch(output->info()->format())
- {
- case Format::RGB888:
- case Format::RGBA8888:
- num_elems_processed_per_iteration = 8;
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- break;
- }
- break;
- }
- case Format::RGB888:
- {
- switch(output->info()->format())
- {
- case Format::RGBA8888:
- case Format::U8:
- num_elems_processed_per_iteration = 16;
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- break;
- }
- break;
- }
- default:
- break;
- }
- ARM_COMPUTE_ERROR_ON_MSG_VAR(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
- string_from_format(input->info()->format()).c_str(),
- string_from_format(output->info()->format()).c_str());
-
- std::stringstream kernel_name;
-
- kernel_name << string_from_format(input->info()->format());
- kernel_name << "_to_";
- kernel_name << string_from_format(output->info()->format());
- kernel_name << "_bt709";
-
- _input = input;
- _output = output;
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name.str());
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, input->info()->valid_region());
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name.str();
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-void CLColorConvertKernel::configure(const ICLMultiImage *input, ICLImage *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLColorConvertKernel::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLImage *output)
-{
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
- ARM_COMPUTE_ERROR_ON(output == nullptr);
-
- unsigned int num_elems_processed_per_iteration = 0;
-
- switch(input->info()->format())
- {
- case Format::NV12:
- case Format::NV21:
- case Format::IYUV:
- {
- switch(output->info()->format())
- {
- case Format::RGB888:
- case Format::RGBA8888:
- num_elems_processed_per_iteration = 4;
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- break;
- }
- break;
- }
- default:
- break;
- }
- ARM_COMPUTE_ERROR_ON_MSG_VAR(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
- string_from_format(input->info()->format()).c_str(),
- string_from_format(output->info()->format()).c_str());
-
- std::stringstream kernel_name;
-
- kernel_name << string_from_format(input->info()->format());
- kernel_name << "_to_";
- kernel_name << string_from_format(output->info()->format());
- kernel_name << "_bt709";
-
- _multi_input = input;
- _output = output;
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name.str());
-
- // Configure kernel window
- const bool has_two_planes = (input->info()->format() == Format::NV12) || (input->info()->format() == Format::NV21);
- const float sub_sampling = (has_two_planes || (input->info()->format() == Format::IYUV)) ? 0.5f : 1;
-
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
- win.set_dimension_step(Window::DimY, 2);
-
- AccessWindowHorizontal plane0_access(input->plane(0)->info(), 0, num_elems_processed_per_iteration);
- AccessWindowRectangle plane1_access(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1,
- sub_sampling, sub_sampling);
- AccessWindowRectangle plane2_access(has_two_planes ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1,
- sub_sampling, sub_sampling);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win,
- plane0_access, plane1_access, plane2_access,
- output_access);
-
- ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(), input->plane(1)->info()->valid_region(),
- input->plane(2)->info()->valid_region());
- output_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name.str();
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->plane(0)->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->plane(0)->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->plane(0)->info()->dimension(1));
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->plane(1)->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->plane(1)->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->plane(1)->info()->dimension(1));
-}
-
-void CLColorConvertKernel::configure(const ICLImage *input, ICLMultiImage *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLColorConvertKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLMultiImage *output)
-{
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
- ARM_COMPUTE_ERROR_ON(output == nullptr);
-
- unsigned int num_elems_processed_per_iteration = 0;
- unsigned int num_elems_read_per_iteration_x = 0;
-
- bool has_two_planes = (output->info()->format() == Format::NV12) || (output->info()->format() == Format::NV21);
- float sub_sampling = (has_two_planes || (output->info()->format() == Format::IYUV)) ? 0.5f : 1;
-
- switch(input->info()->format())
- {
- case Format::RGB888:
- case Format::RGBA8888:
- {
- switch(output->info()->format())
- {
- case Format::NV12:
- case Format::IYUV:
- num_elems_processed_per_iteration = 2;
- num_elems_read_per_iteration_x = 8;
- break;
- case Format::YUV444:
- num_elems_processed_per_iteration = 4;
- num_elems_read_per_iteration_x = 16;
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- break;
- }
- break;
- }
- case Format::UYVY422:
- case Format::YUYV422:
- {
- switch(output->info()->format())
- {
- case Format::NV12:
- case Format::IYUV:
- num_elems_processed_per_iteration = 8;
- num_elems_read_per_iteration_x = 8;
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- break;
- }
- break;
- }
- default:
- break;
- }
-
- ARM_COMPUTE_ERROR_ON_MSG_VAR(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
- string_from_format(input->info()->format()).c_str(),
- string_from_format(output->info()->format()).c_str());
-
- std::stringstream kernel_name;
-
- kernel_name << string_from_format(input->info()->format());
- kernel_name << "_to_";
- kernel_name << string_from_format(output->info()->format());
- kernel_name << "_bt709";
- _input = input;
- _multi_output = output;
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name.str());
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- if((input->info()->format() != Format::RGB888 || output->info()->format() != Format::YUV444) && (input->info()->format() != Format::RGBA8888 || output->info()->format() != Format::YUV444))
- {
- win.set_dimension_step(Window::DimY, 2);
- }
-
- AccessWindowHorizontal output_plane0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration);
- AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling);
- AccessWindowRectangle output_plane2_access(has_two_planes ? nullptr : output->plane(2)->info(), 0, 0,
- num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling);
-
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_read_per_iteration_x);
-
- update_window_and_padding(win,
- input_access,
- output_plane0_access,
- output_plane1_access,
- output_plane2_access);
-
- ValidRegion input_region = input->info()->valid_region();
-
- output_plane0_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(0)->info()->tensor_shape()));
- output_plane1_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(1)->info()->tensor_shape()));
- output_plane2_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(2)->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name.str();
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-void CLColorConvertKernel::configure(const ICLMultiImage *input, ICLMultiImage *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLColorConvertKernel::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLMultiImage *output)
-{
- unsigned int num_elems_processed_per_iteration = 0;
- switch(input->info()->format())
- {
- case Format::NV12:
- case Format::NV21:
- {
- switch(output->info()->format())
- {
- case Format::IYUV:
- case Format::YUV444:
- num_elems_processed_per_iteration = 16;
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- break;
- }
- break;
- }
- case Format::IYUV:
- {
- switch(output->info()->format())
- {
- case Format::YUV444:
- case Format::NV12:
- num_elems_processed_per_iteration = 16;
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- break;
- }
- break;
- }
- default:
- break;
- }
- ARM_COMPUTE_ERROR_ON_MSG_VAR(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
- string_from_format(input->info()->format()).c_str(),
- string_from_format(output->info()->format()).c_str());
-
- std::stringstream kernel_name;
-
- kernel_name << string_from_format(input->info()->format());
- kernel_name << "_to_";
- kernel_name << string_from_format(output->info()->format());
- kernel_name << "_bt709";
-
- _multi_input = input;
- _multi_output = output;
-
- // Create kernel
- bool has_two_input_planars = (input->info()->format() == Format::NV12) || (input->info()->format() == Format::NV21);
- bool has_two_output_planars = (output->info()->format() == Format::NV12) || (output->info()->format() == Format::NV21);
-
- float sub_sampling_input = (has_two_input_planars || (input->info()->format() == Format::IYUV)) ? 0.5f : 1;
- float sub_sampling_output = (has_two_output_planars || (output->info()->format() == Format::IYUV)) ? 0.5f : 1;
-
- _kernel = create_kernel(compile_context, kernel_name.str());
-
- Window win = calculate_max_window(*input->cl_plane(0)->info(), Steps(num_elems_processed_per_iteration));
- win.set_dimension_step(Window::DimY, 2);
-
- AccessWindowHorizontal input_plane0_access(input->plane(0)->info(), 0, num_elems_processed_per_iteration);
- AccessWindowRectangle input_plane1_access(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1,
- sub_sampling_input, sub_sampling_input);
- AccessWindowRectangle input_plane2_access(has_two_input_planars ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1,
- sub_sampling_input, sub_sampling_input);
- AccessWindowHorizontal output_plane0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration);
- AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling_output, sub_sampling_output);
- AccessWindowRectangle output_plane2_access(has_two_output_planars ? nullptr : output->plane(2)->info(), 0, 0,
- num_elems_processed_per_iteration, 1, sub_sampling_output, sub_sampling_output);
-
- update_window_and_padding(win,
- input_plane0_access, input_plane1_access, input_plane2_access,
- output_plane0_access, output_plane1_access, output_plane2_access);
-
- ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(), input->plane(1)->info()->valid_region(),
- input->plane(2)->info()->valid_region());
- output_plane0_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(0)->info()->tensor_shape()));
- output_plane1_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(1)->info()->tensor_shape()));
- output_plane2_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(2)->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name.str();
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->plane(0)->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->plane(0)->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->plane(0)->info()->dimension(1));
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->plane(1)->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->plane(1)->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->plane(1)->info()->dimension(1));
-}
-
-void CLColorConvertKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
-
- if(nullptr != _input && nullptr != _output)
- {
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
- add_2D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
- }
- else if(nullptr != _input && nullptr != _multi_output)
- {
- Format format = _multi_output->info()->format();
- do
- {
- Window win_uv(slice);
-
- if((Format::NV12 == format) || (Format::NV21 == format) || (Format::IYUV == format))
- {
- win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
- win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
- }
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
- add_2D_tensor_argument(idx, _multi_output->cl_plane(0), slice);
- for(int i = 1; i < 3 && (0 != _multi_output->cl_plane(i)->info()->num_dimensions()); ++i)
- {
- add_2D_tensor_argument(idx, _multi_output->cl_plane(i), win_uv);
- }
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
- }
- else if(nullptr != _multi_input && nullptr != _output)
- {
- Format format = _multi_input->info()->format();
- do
- {
- Window win_uv(slice);
-
- if((Format::NV12 == format) || (Format::NV21 == format) || (Format::IYUV == format))
- {
- win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
- win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
- }
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _multi_input->cl_plane(0), slice);
-
- for(int i = 1; i < 3 && (0 != _multi_input->cl_plane(i)->info()->num_dimensions()); ++i)
- {
- add_2D_tensor_argument(idx, _multi_input->cl_plane(i), win_uv);
- }
- add_2D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
- }
- else if(nullptr != _multi_input && nullptr != _multi_output)
- {
- Format in_format = _multi_input->info()->format();
- Format out_format = _multi_output->info()->format();
- do
- {
- Window win_in_uv(slice);
- if((Format::NV12 == in_format) || (Format::NV21 == in_format) || (Format::IYUV == in_format))
- {
- win_in_uv.set(Window::DimX, Window::Dimension(win_in_uv.x().start() / 2,
- win_in_uv.x().end() / 2, win_in_uv.x().step() / 2));
- win_in_uv.set(Window::DimY, Window::Dimension(win_in_uv.y().start() / 2, win_in_uv.y().end() / 2, 1));
- }
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _multi_input->cl_plane(0), slice);
- for(int i = 1; i < 3 && (0 != _multi_input->cl_plane(i)->info()->num_dimensions()); ++i)
- {
- add_2D_tensor_argument(idx, _multi_input->cl_plane(i), win_in_uv);
- }
-
- Window win_out_uv(slice);
- if((Format::NV12 == out_format) || (Format::NV21 == out_format) || (Format::IYUV == out_format))
- {
- win_out_uv.set(Window::DimX, Window::Dimension(win_out_uv.x().start() / 2,
- win_out_uv.x().end() / 2, win_out_uv.x().step() / 2));
- win_out_uv.set(Window::DimY, Window::Dimension(win_out_uv.y().start() / 2, win_out_uv.y().end() / 2, 1));
- }
-
- add_2D_tensor_argument(idx, _multi_output->cl_plane(0), slice);
- for(int i = 1; i < 3 && (0 != _multi_output->cl_plane(i)->info()->num_dimensions()); ++i)
- {
- add_2D_tensor_argument(idx, _multi_output->cl_plane(i), win_out_uv);
- }
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
- }
- else
- {
- ARM_COMPUTE_ERROR("Not supported");
- }
-}
diff --git a/src/core/CL/kernels/CLComparisonKernel.cpp b/src/core/CL/kernels/CLComparisonKernel.cpp
index 216190752c..a0f9aca54a 100644
--- a/src/core/CL/kernels/CLComparisonKernel.cpp
+++ b/src/core/CL/kernels/CLComparisonKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,11 +21,16 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLComparisonKernel.h"
+#include "src/core/CL/kernels/CLComparisonKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
#include <map>
@@ -35,22 +40,16 @@ namespace arm_compute
namespace
{
// Create supported comparisons map
-const std::map<ComparisonOperation, std::string> supported_comparison_ops =
-{
- { ComparisonOperation::Equal, "EQUAL" },
- { ComparisonOperation::NotEqual, "NOTEQUAL" },
- { ComparisonOperation::Greater, "GREATER" },
- { ComparisonOperation::GreaterEqual, "GREATEREQUAL" },
- { ComparisonOperation::Less, "LESS" },
- { ComparisonOperation::LessEqual, "LESSEQUAL" },
+const std::map<ComparisonOperation, std::string> supported_comparison_ops = {
+ {ComparisonOperation::Equal, "EQUAL"}, {ComparisonOperation::NotEqual, "NOTEQUAL"},
+ {ComparisonOperation::Greater, "GREATER"}, {ComparisonOperation::GreaterEqual, "GREATEREQUAL"},
+ {ComparisonOperation::Less, "LESS"}, {ComparisonOperation::LessEqual, "LESSEQUAL"},
};
-int calculate_num_elems_processed_per_iteration(const ITensorInfo &input)
-{
- return 16 / input.element_size();
-}
-
-Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ComparisonOperation operation)
+Status validate_arguments(const ITensorInfo &input1,
+ const ITensorInfo &input2,
+ const ITensorInfo &output,
+ ComparisonOperation operation)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
ARM_COMPUTE_RETURN_ERROR_ON(input1.data_type() == DataType::UNKNOWN);
@@ -61,7 +60,7 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
// Validate in case of configured output
- if(output.total_size() > 0)
+ if (output.total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
@@ -73,45 +72,37 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
{
- const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
- const TensorShape &out_shape = broadcast_pair.first;
- const ValidRegion &valid_region = broadcast_pair.second;
-
- const unsigned int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(input1);
+ const TensorShape &out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(16 / input1.element_size(), output.dimension(0));
// Auto initialize output if not initialized
auto_init_if_empty(output, out_shape, 1, DataType::U8, QuantizationInfo());
- Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
- Window win_input1 = win.broadcast_if_dimension_le_one(input1);
- Window win_input2 = win.broadcast_if_dimension_le_one(input2);
-
- AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
+ Window win = calculate_max_window(out_shape, Steps(num_elems_processed_per_iteration));
- bool window_changed = update_window_and_padding(win_input1, input1_access)
- || update_window_and_padding(win_input2, input2_access)
- || update_window_and_padding(win, output_access);
-
- output_access.set_valid_region(win, valid_region);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
+ return std::make_pair(Status{}, win);
}
} // namespace
-CLComparisonKernel::CLComparisonKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
+CLComparisonKernel::CLComparisonKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLComparisonKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+void CLComparisonKernel::configure(const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ComparisonOperation operation)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation);
}
-void CLComparisonKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+void CLComparisonKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ComparisonOperation operation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), operation));
@@ -127,17 +118,29 @@ void CLComparisonKernel::configure(const CLCompileContext &compile_context, cons
const std::string &operation_name = supported_comparison_ops.at(operation);
std::string kernel_name = "compare_" + lower_string(operation_name);
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(16 / input1->info()->element_size(), output->info()->dimension(0));
+
// Set kernel build options
std::set<std::string> build_opts;
build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type()));
- build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(calculate_num_elems_processed_per_iteration(*input1->info())));
+ build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.emplace("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.emplace(
+ "-DVEC_SIZE_IN1=" + //
+ support::cpp11::to_string(input1->info()->dimension(0) == 1 ? 1 : num_elems_processed_per_iteration));
+ build_opts.emplace(
+ "-DVEC_SIZE_IN2=" + //
+ support::cpp11::to_string(input2->info()->dimension(0) == 1 ? 1 : num_elems_processed_per_iteration));
build_opts.emplace("-DOP=" + operation_name);
build_opts.emplace("-DOP_NAME=" + lower_string(operation_name));
- if(is_data_type_quantized(input1->info()->data_type()))
+ if (is_data_type_quantized(input1->info()->data_type()))
{
const UniformQuantizationInfo iq1_info = input1->info()->quantization_info().uniform();
const UniformQuantizationInfo iq2_info = input2->info()->quantization_info().uniform();
+ build_opts.emplace("-DIS_QUANTIZED");
build_opts.emplace("-DOFFSET_IN1=" + support::cpp11::to_string(iq1_info.offset));
build_opts.emplace("-DOFFSET_IN2=" + support::cpp11::to_string(iq2_info.offset));
build_opts.emplace("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale));
@@ -161,12 +164,16 @@ void CLComparisonKernel::configure(const CLCompileContext &compile_context, cons
_config_id += lower_string(string_from_data_layout(input1->info()->data_layout()));
}
-Status CLComparisonKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation)
+Status CLComparisonKernel::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ComparisonOperation operation)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, operation));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
return Status{};
}
@@ -182,17 +189,18 @@ void CLComparisonKernel::run(const Window &window, cl::CommandQueue &queue)
bool can_collapse = true;
const bool is_vector = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1;
- if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
{
can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
{
can_collapse = (in_shape1[d] == in_shape2[d]);
}
}
bool has_collapsed = false;
- Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
@@ -213,16 +221,7 @@ void CLComparisonKernel::run(const Window &window, cl::CommandQueue &queue)
ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
-BorderSize CLComparisonKernel::border_size() const
-{
- const int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(*_input1->info());
-
- const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
- const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
- return BorderSize{ 0, border, 0, 0 };
-}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLComparisonKernel.h b/src/core/CL/kernels/CLComparisonKernel.h
new file mode 100644
index 0000000000..2fb4ba06b6
--- /dev/null
+++ b/src/core/CL/kernels/CLComparisonKernel.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CORE_CL_KERNELS_CLCOMPARISONKERNEL_H
+#define ACL_SRC_CORE_CL_KERNELS_CLCOMPARISONKERNEL_H
+
+#include "arm_compute/core/Types.h"
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ICLTensor;
+
+/** Interface for the comparison kernel. */
+class CLComparisonKernel : public ICLKernel
+{
+public:
+ /** Default constructor. */
+ CLComparisonKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLComparisonKernel(const CLComparisonKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLComparisonKernel &operator=(const CLComparisonKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLComparisonKernel(CLComparisonKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLComparisonKernel &operator=(CLComparisonKernel &&) = default;
+ /** Default destructor */
+ ~CLComparisonKernel() = default;
+ /** Set the inputs and output tensors
+ *
+ * @param[in] input1 Source tensor. Data types supported: All.
+ * @param[in] input2 Source tensor. Data types supported: Same as @p input1.
+ * @param[out] output Destination tensor. Data types supported: U8.
+ * @param[in] operation Comparison operation to use.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation);
+ /** Set the inputs and output tensors
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input1 Source tensor. Data types supported: All.
+ * @param[in] input2 Source tensor. Data types supported: Same as @p input1.
+ * @param[out] output Destination tensor. Data types supported: U8.
+ * @param[in] operation Comparison operation to use.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ComparisonOperation operation);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLComparisonKernel
+ *
+ * @param[in] input1 Source tensor. Data types supported: All.
+ * @param[in] input2 Source tensor. Data types supported: Same as @p input1.
+ * @param[in] output Destination tensor. Data types supported: U8.
+ * @param[in] operation Comparison operation to use.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ComparisonOperation operation);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input1; /**< Source tensor 1 */
+ const ICLTensor *_input2; /**< Source tensor 2 */
+ ICLTensor *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif // ACL_SRC_CORE_CL_KERNELS_CLCOMPARISONKERNEL_H
diff --git a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp
deleted file mode 100644
index 9670fae36a..0000000000
--- a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-CLConvertFullyConnectedWeightsKernel::CLConvertFullyConnectedWeightsKernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLConvertFullyConnectedWeightsKernel::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
- DataLayout data_layout)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, original_input_shape, data_layout);
-}
-
-void CLConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
- DataLayout data_layout)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output tensor auto initialisation if not yet initialized
- auto_init_if_empty(*output->info(), *input->info()->clone());
-
- ARM_COMPUTE_ERROR_THROW_ON(CLConvertFullyConnectedWeightsKernel::validate(input->info(), output->info(), original_input_shape, data_layout));
-
- _input = input;
- _output = output;
-
- const DataLayout input_data_layout = (data_layout == DataLayout::NCHW) ? DataLayout::NHWC : DataLayout::NCHW;
-
- const int width_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::WIDTH);
- const int height_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::HEIGHT);
- const int channel_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::CHANNEL);
-
- const unsigned int num_elems_per_input_plane = original_input_shape[width_idx] * original_input_shape[height_idx];
- const unsigned int num_channels = original_input_shape[channel_idx];
-
- const unsigned int factor_1 = (data_layout == DataLayout::NCHW) ? num_elems_per_input_plane : num_channels;
- const unsigned int factor_2 = (data_layout == DataLayout::NCHW) ? num_channels : num_elems_per_input_plane;
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
- build_opts.add_option("-DFACTOR_1=" + support::cpp11::to_string(factor_1));
- build_opts.add_option("-DFACTOR_2=" + support::cpp11::to_string(factor_2));
-
- // Create kernel
- _kernel = create_kernel(compile_context, "convert_fc_weights", build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
- ICLKernel::configure_internal(win);
-}
-
-Status CLConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
- DataLayout data_layout)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() != 2);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != original_input_shape.total_size_lower(3));
- ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN);
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
-
- return Status{};
-}
-
-void CLConvertFullyConnectedWeightsKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, window);
- add_2D_tensor_argument(idx, _output, window);
- enqueue(queue, *this, window, lws_hint());
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLConvolutionKernel.cpp b/src/core/CL/kernels/CLConvolutionKernel.cpp
deleted file mode 100644
index 2476180ba4..0000000000
--- a/src/core/CL/kernels/CLConvolutionKernel.cpp
+++ /dev/null
@@ -1,392 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <sstream>
-#include <string>
-
-namespace arm_compute
-{
-namespace
-{
-constexpr unsigned int max_matrix_size = 81;
-} // namespace
-
-/****************************************************************************************\
- * Square Convolution *
-\****************************************************************************************/
-
-template <unsigned int matrix_size>
-BorderSize CLConvolutionKernel<matrix_size>::border_size() const
-{
- return BorderSize(matrix_size / 2);
-}
-
-template <unsigned int matrix_size>
-void CLConvolutionKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, scale, border_undefined);
-}
-
-template <unsigned int matrix_size>
-void CLConvolutionKernel<matrix_size>::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON(conv == nullptr);
-
- _input = input;
- _output = output;
-
- std::stringstream kernel_name;
- CLBuildOptions build_opts;
- kernel_name << "convolution" << matrix_size << "x" << matrix_size << "_static";
-
- if(scale == 0)
- {
- scale = calculate_matrix_scale(conv, matrix_size);
- }
-
- for(unsigned int i = 0; i < matrix_size * matrix_size; i++)
- {
- std::stringstream mat_str;
- mat_str << "-DMAT" << i << "=" << conv[i];
- build_opts.add_option(mat_str.str());
- }
-
- build_opts.add_option("-DSCALE=" + support::cpp11::to_string(scale));
-
- DataType data_type = data_type_for_convolution_matrix(conv, matrix_size * matrix_size);
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
-
- std::stringstream out_type;
- out_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type());
- build_opts.add_option(out_type.str());
-
- _kernel = create_kernel(compile_context, kernel_name.str(), build_opts.options());
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 8;
- constexpr unsigned int num_elems_written_per_iteration = 8;
- constexpr unsigned int num_elems_read_per_iteration = 16;
- constexpr unsigned int num_rows_read_per_iteration = matrix_size;
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
- AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
- ICLKernel::configure_internal(win);
-}
-
-/****************************************************************************************\
- * Separable Convolution *
-\****************************************************************************************/
-template <unsigned int matrix_size>
-CLSeparableConvolutionHorKernel<matrix_size>::CLSeparableConvolutionHorKernel()
- : _border_size(0)
-{
-}
-
-template <unsigned int matrix_size>
-BorderSize CLSeparableConvolutionHorKernel<matrix_size>::border_size() const
-{
- return _border_size;
-}
-
-template <unsigned int matrix_size>
-void CLSeparableConvolutionHorKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, border_undefined);
-}
-
-template <unsigned int matrix_size>
-void CLSeparableConvolutionHorKernel<matrix_size>::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::S16, DataType::S32);
-
- ARM_COMPUTE_ERROR_ON((matrix_size != 5) && (matrix_size != 7) && (matrix_size != 9));
-
- _input = input;
- _output = output;
- _border_size = BorderSize(border_undefined ? 0 : matrix_size / 2, matrix_size / 2);
-
- // Set build options
- std::set<std::string> build_opts;
-
- std::array<int16_t, matrix_size *matrix_size> mat = { 0 };
- memcpy(mat.data(), conv, matrix_size * sizeof(int16_t));
-
- for(unsigned int j = 0; j < matrix_size * matrix_size; j++)
- {
- build_opts.insert("-DMAT" + support::cpp11::to_string(j) + "=" + support::cpp11::to_string(mat[j]));
- }
-
- build_opts.insert("-DSCALE=0");
-
- build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
-
- // Create kernel
- const std::string kernel_name = "convolution_separable1x" + support::cpp11::to_string(matrix_size) + "_static";
- _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 8;
- constexpr unsigned int num_elems_read_per_iteration = 16;
- constexpr unsigned int num_elems_written_per_iteration = 8;
-
- Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
- AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(border_undefined);
-}
-
-template <unsigned int matrix_size>
-BorderSize CLSeparableConvolutionVertKernel<matrix_size>::border_size() const
-{
- return BorderSize{ matrix_size / 2, 0 };
-}
-
-template <unsigned int matrix_size>
-void CLSeparableConvolutionVertKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output,
- const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, scale, border_undefined, data_type);
-}
-
-template <unsigned int matrix_size>
-void CLSeparableConvolutionVertKernel<matrix_size>::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
- const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON((matrix_size != 5) && (matrix_size != 7) && (matrix_size != 9));
- ARM_COMPUTE_ERROR_ON(scale == 0);
-
- _input = input;
- _output = output;
-
- std::set<std::string> build_opts;
-
- std::array<int16_t, matrix_size *matrix_size> mat = { 0 };
- memcpy(mat.data() + matrix_size, conv, matrix_size * sizeof(int16_t));
-
- for(unsigned int j = 0; j < matrix_size * matrix_size; j++)
- {
- build_opts.insert("-DMAT" + support::cpp11::to_string(j) + "=" + support::cpp11::to_string(mat[j]));
- }
-
- build_opts.insert("-DSCALE=" + support::cpp11::to_string(scale));
-
- build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-
- build_opts.insert("-DCOMPUTE_TYPE=" + get_cl_type_from_data_type(data_type));
-
- std::stringstream out_type;
- out_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type());
- build_opts.insert(out_type.str());
-
- // Create kernel
- const std::string kernel_name = "convolution_separable" + support::cpp11::to_string(matrix_size) + "x1_static";
- _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 8;
- constexpr unsigned int num_elems_written_per_iteration = 8;
- constexpr unsigned int num_elems_read_per_iteration = 8;
- constexpr unsigned int num_rows_read_per_iteration = matrix_size;
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
- AccessWindowRectangle input_access(input->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(data_type));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(border_undefined);
-}
-
-/****************************************************************************************\
- * Rectangle Convolution *
-\****************************************************************************************/
-
-CLConvolutionRectangleKernel::CLConvolutionRectangleKernel()
- : _border_size(0), _input(nullptr), _output(nullptr)
-{
-}
-
-BorderSize CLConvolutionRectangleKernel::border_size() const
-{
- return _border_size;
-}
-
-void CLConvolutionRectangleKernel::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, width, height, scale, border_undefined);
-}
-
-void CLConvolutionRectangleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale,
- bool border_undefined)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON(nullptr == conv);
- ARM_COMPUTE_ERROR_ON(3 != width && 5 != width && 7 != width && 9 != width);
- ARM_COMPUTE_ERROR_ON(3 != height && 5 != height && 7 != height && 9 != height);
- ARM_COMPUTE_ERROR_ON(0 == scale);
-
- _input = input;
- _output = output;
- _border_size = BorderSize(height / 2, width / 2);
-
- std::set<std::string> options;
-
- std::stringstream output_type;
- output_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type());
- options.insert(output_type.str());
-
- uint32_t matrix_size = width * height;
-
- std::array<int16_t, max_matrix_size> mat = { 0 };
-
- memcpy(mat.data(), conv, matrix_size * sizeof(int16_t));
-
- for(unsigned int j = 0; j < max_matrix_size; j++)
- {
- options.insert("-DMAT" + support::cpp11::to_string(j) + "=" + support::cpp11::to_string(mat[j]));
- }
-
- options.insert("-DSCALE=" + support::cpp11::to_string(scale));
-
- DataType data_type = data_type_for_convolution_matrix(conv, matrix_size);
- options.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
-
- options.insert("-DMATRIX_WIDTH=" + support::cpp11::to_string(width));
- options.insert("-DMATRIX_HEIGHT=" + support::cpp11::to_string(height));
-
- _kernel = create_kernel(compile_context, "convolution_rectangle", options);
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 8;
- constexpr unsigned int num_elems_read_per_iteration = 16;
- constexpr unsigned int num_elems_written_per_iteration = 8;
- const unsigned int num_rows_read_per_iteration = height;
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
- AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
- ICLKernel::configure_internal(win);
-}
-
-void CLConvolutionRectangleKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
- add_2D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
-
-template class arm_compute::CLConvolutionKernel<3>;
-template class arm_compute::CLConvolutionKernel<5>;
-template class arm_compute::CLConvolutionKernel<7>;
-template class arm_compute::CLConvolutionKernel<9>;
-template class arm_compute::CLSeparableConvolutionVertKernel<5>;
-template class arm_compute::CLSeparableConvolutionVertKernel<7>;
-template class arm_compute::CLSeparableConvolutionVertKernel<9>;
-template class arm_compute::CLSeparableConvolutionHorKernel<5>;
-template class arm_compute::CLSeparableConvolutionHorKernel<7>;
-template class arm_compute::CLSeparableConvolutionHorKernel<9>;
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLCopyKernel.cpp b/src/core/CL/kernels/CLCopyKernel.cpp
deleted file mode 100644
index a864502a5f..0000000000
--- a/src/core/CL/kernels/CLCopyKernel.cpp
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding = PaddingList(), Window *output_window = nullptr)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_ON(!padding.empty() && output_window != nullptr);
- ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 4);
-
- // Validate output if initialized
- if(output->total_size() != 0)
- {
- if(output_window == nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding), output->tensor_shape());
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output_window->shape());
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, Window *output_window)
-{
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, *input);
-
- // Configure window
- const unsigned int vec_size_x = 16 / input->element_size();
-
- if(output_window == nullptr)
- {
- // Create and update the window (if needed)
- Window win = calculate_max_window(*input, Steps(vec_size_x));
-
- AccessWindowHorizontal input_access(input, 0, vec_size_x);
- AccessWindowHorizontal output_access(output, 0, vec_size_x);
-
- bool window_changed = update_window_and_padding(win, input_access, output_access);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
- }
- else
- {
- Window win = calculate_max_window(*input);
- return std::make_pair(Status{}, win);
- }
-}
-
-std::pair<Status, Window> validate_and_configure_window_with_padding(ITensorInfo *input, ITensorInfo *output, const PaddingList &padding)
-{
- TensorShape input_shape = input->tensor_shape();
- TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input_shape, padding);
-
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(padded_shape));
-
- // Configure window
- const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
-
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
- // Pad on the x dimension accounting for the padding offset along the same dimension
- AccessWindowHorizontal output_access(output, padding[0].first, num_elems_processed_per_iteration);
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- bool window_changed = update_window_and_padding(win, input_access, output_access);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
-/** Generate the string "-DPAD= @p dim @p index @p padding"
- *
- * @param[in] dim The dimension index
- * @param[in] index Can be 0 for the start dimension and 1 for the end dimension
- * @param[in] padding The value to pad for that index/dimension pair
- *
- * @return The correct concatenated string
- */
-std::string generate_pad_string(const size_t dim, const size_t index, const size_t padding)
-{
- return "-DPAD" + support::cpp11::to_string(dim) + support::cpp11::to_string(index) + "=" + support::cpp11::to_string(padding);
-}
-
-/** Pass the padding as build option to the kernel.
- *
- * @param[in] tensor The padded tensor
- * @param[in] padding The list of the padding for each dimension
- * @param[out] build_opts The build option to which adding the padding
- */
-void add_padding_as_build_options(const PaddingList &padding, CLBuildOptions &build_opts)
-{
- size_t dim = 0;
- for(dim = 0; dim < padding.size(); dim++)
- {
- build_opts.add_option(generate_pad_string(dim, 0, padding[dim].first));
- build_opts.add_option(generate_pad_string(dim, 1, padding[dim].second));
- }
-
- while(dim < TensorShape::num_max_dimensions)
- {
- build_opts.add_option(generate_pad_string(dim, 0, 0));
- build_opts.add_option(generate_pad_string(dim, 1, 0));
- dim++;
- }
-}
-
-} // namespace
-
-CLCopyKernel::CLCopyKernel()
- : _input(nullptr), _output(nullptr), _output_window(), _has_output_window(false)
-{
-}
-
-void CLCopyKernel::configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, Window *output_window)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, output_window);
-}
-
-void CLCopyKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, Window *output_window)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), padding, output_window));
-
- _input = input;
- _output = output;
-
- // Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-
- std::pair<Status, Window> win_config;
-
- const unsigned int vec_size_x = 16 / input->info()->element_size();
-
- if(padding.empty())
- {
- // Configure window
- win_config = validate_and_configure_window(input->info(), output->info(), output_window);
-
- if(output_window != nullptr)
- {
- _has_output_window = true;
- _output_window = Window(*output_window);
- const int width_x = output_window->num_iterations(0);
- const bool multi_access_x = width_x >= static_cast<int32_t>(vec_size_x);
- const bool remainder_x = width_x % vec_size_x > 0;
-
- if(multi_access_x)
- {
- _output_window.set(Window::DimX, Window::Dimension(output_window->x().start(), ceil_to_multiple(output_window->x().end(), vec_size_x), vec_size_x));
- win_config.second.set(Window::DimX, Window::Dimension(win_config.second.x().start(), ceil_to_multiple(win_config.second.x().end(), vec_size_x), vec_size_x));
- }
-
- build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
- build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(width_x - vec_size_x, 0)));
- }
- else
- {
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
- }
-
- // Build kernel
- _kernel = create_kernel(compile_context, "copy_tensor", build_opts.options());
- }
- else
- {
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
-
- // Add compile time options
- add_padding_as_build_options(padding, build_opts);
-
- // If we are padding in the fourth dimension the kernel needs to know the depth of the
- // different cubes
- if(padding.size() == 4)
- {
- const size_t depth = input->info()->tensor_shape()[2];
- build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(depth));
- }
-
- // Build kernel
- _kernel = create_kernel(compile_context, "copy_pad_tensor", build_opts.options());
-
- // Configure window
- win_config = validate_and_configure_window_with_padding(input->info(), output->info(), padding);
- }
-
- // Validate and set the window
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-}
-
-Status CLCopyKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, const PaddingList &padding, Window *output_window)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, output_window));
-
- if(padding.empty())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), output_window).first);
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_with_padding(input->clone().get(), output->clone().get(), padding).first);
- }
-
- return Status{};
-}
-
-void CLCopyKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice;
-
- if(_has_output_window)
- {
- slice = window.first_slice_window_3D();
- Window out_slice = _output_window.first_slice_window_3D();
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice) && _output_window.slide_window_slice_3D(out_slice));
- }
- else
- {
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- slice = collapsed.first_slice_window_3D();
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
- }
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLCropKernel.cpp b/src/core/CL/kernels/CLCropKernel.cpp
deleted file mode 100644
index 29a97bbfa4..0000000000
--- a/src/core/CL/kernels/CLCropKernel.cpp
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLCropKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Window.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/helpers/bit_ops.h"
-#include "arm_compute/core/utils/helpers/tensor_transform.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/StringSupport.h"
-
-#include <map>
-
-namespace arm_compute
-{
-CLCropKernel::CLCropKernel()
- : _input(nullptr), _output(nullptr), _start(), _batch_index(0), _extrapolation_value(0)
-{
-}
-
-void CLCropKernel::configure(const ICLTensor *input, ICLTensor *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *output_window)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, start, end, batch_index, extrapolation_value, output_window);
-}
-
-void CLCropKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index,
- float extrapolation_value, Window *output_window)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), start, end, batch_index, extrapolation_value, output_window));
-
- _input = input;
- _output = output;
- _start = start;
- _batch_index = batch_index;
- _extrapolation_value = extrapolation_value;
-
- const int vec_size_x = 4;
- // Create and update the window (if needed)
- Window win = calculate_max_window(*output->info());
-
- if(output_window != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *output_window);
- win = *output_window;
- }
-
- const int output_width_x = win.num_iterations(0);
- const bool multi_access_x = output_width_x >= vec_size_x;
- const bool remainder_x = output_width_x % vec_size_x > 0;
-
- if(multi_access_x)
- {
- win.set(Window::DimX,
- Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
- }
- ICLKernel::configure_internal(win);
-
- // Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
- build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
- build_opts.add_option_if(start.x > end.x, "-DWIDTH_FLIPPED=");
- build_opts.add_option_if(start.y > end.y, "-DHEIGHT_FLIPPED=");
- _kernel = create_kernel(compile_context, "crop_tensor", build_opts.options());
-}
-
-Status CLCropKernel::validate(const ITensorInfo *input, const ITensorInfo *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *output_window)
-{
- ARM_COMPUTE_UNUSED(extrapolation_value, output_window);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().num_dimensions() > 4);
- ARM_COMPUTE_RETURN_ERROR_ON(start.x < 0 || start.y < 0 || end.x < 0 || end.y < 0);
- ARM_COMPUTE_RETURN_ERROR_ON(start.x >= static_cast<int32_t>(input->dimension(1)) || start.y >= static_cast<int32_t>(input->dimension(2))
- || end.x >= static_cast<int32_t>(input->dimension(1)) || end.y >= static_cast<int32_t>(input->dimension(2)));
- ARM_COMPUTE_RETURN_ERROR_ON(batch_index >= input->dimension(3));
- if(output_window != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(output_window->x().step() != 1);
- }
- if(output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 3);
- }
- return Status{};
-}
-
-void CLCropKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window in_slice = Window();
- in_slice.use_tensor_dimensions(_input->info()->tensor_shape());
- in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), ceil_to_multiple(in_slice.x().end(), window.x().step()), window.x().step()));
- in_slice.set(3, Window::Dimension(_batch_index, _batch_index + 1, 1));
-
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, in_slice);
- add_3D_tensor_argument(idx, _output, window);
- add_argument(idx, _start.x);
- add_argument(idx, _start.y);
- enqueue(queue, *this, window, lws_hint());
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
index a368fae43b..f8ecc4c098 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,25 +21,28 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
+#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
namespace arm_compute
{
CLDeconvolutionLayerUpsampleKernel::CLDeconvolutionLayerUpsampleKernel()
: _input(nullptr), _output(nullptr), _info(), _data_layout(DataLayout::UNKNOWN)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
const PadStrideInfo &info)
{
ARM_COMPUTE_UNUSED(info);
@@ -59,7 +62,7 @@ Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, co
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0);
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c));
- for(size_t i = 3; i < Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
}
@@ -67,19 +70,21 @@ Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, co
return Status{};
}
-void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output,
- const PadStrideInfo &info)
+void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
}
-void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
- const PadStrideInfo &info)
+void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PadStrideInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayerUpsampleKernel::validate(input->info(), output->info(), info));
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
@@ -96,9 +101,9 @@ void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compi
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
ICLKernel::configure_internal(win);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -117,7 +122,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
const int out_end_y = _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1;
const int out_step_y = _info.stride().second;
- switch(_data_layout)
+ switch (_data_layout)
{
case DataLayout::NCHW:
{
@@ -135,8 +140,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
add_3D_tensor_argument(idx, _input, slice_in);
add_3D_tensor_argument(idx, _output, slice_out);
enqueue(queue, *this, slice_out, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out));
+ } while (collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out));
break;
}
case DataLayout::NHWC:
@@ -154,8 +158,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
add_3D_tensor_argument(idx, _input, slice_in);
add_3D_tensor_argument(idx, _output, slice_out);
enqueue(queue, *this, slice_out, lws_hint());
- }
- while(window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
break;
}
default:
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
new file mode 100644
index 0000000000..762989a836
--- /dev/null
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLEKERNEL_H
+#define ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLEKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the Deconvolution layer kernel on OpenCL.
+ */
+class CLDeconvolutionLayerUpsampleKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLDeconvolutionLayerUpsampleKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDeconvolutionLayerUpsampleKernel(const CLDeconvolutionLayerUpsampleKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDeconvolutionLayerUpsampleKernel &operator=(const CLDeconvolutionLayerUpsampleKernel &) = delete;
+ /** Default Move Constructor. */
+ CLDeconvolutionLayerUpsampleKernel(CLDeconvolutionLayerUpsampleKernel &&) = default;
+ /** Default move assignment operator */
+ CLDeconvolutionLayerUpsampleKernel &operator=(CLDeconvolutionLayerUpsampleKernel &&) = default;
+ /** Default destructor */
+ ~CLDeconvolutionLayerUpsampleKernel() = default;
+
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Source tensor. Data types supported: All.
+ * @param[out] output Destination tensor. Data types supported: same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+ * @param[in] info Contains padding and stride information described in @ref PadStrideInfo.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info);
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. Data types supported: All.
+ * @param[out] output Destination tensor. Data types supported: same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+ * @param[in] info Contains padding and stride information described in @ref PadStrideInfo.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PadStrideInfo &info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample
+ *
+ * @param[in] input Source tensor info. Data types supported: All.
+ * @param[in] output Destination tensor info. Data types supported: same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+ * @param[in] info Contains padding and stride information described in @ref PadStrideInfo.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ PadStrideInfo _info;
+ DataLayout _data_layout;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLEKERNEL_H */
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
index 7a4b7df5e2..b33e0a8b6f 100644
--- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,22 +21,29 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
+#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, input_info, weights_info);
@@ -51,19 +58,21 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_w) != deconv_info.stride().first);
ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_h) != deconv_info.stride().second);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32);
- if(!is_qasymm)
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S32);
+ if (!is_qasymm)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_info, weights_info);
}
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_info->dimension(idx_w) * weights_info->dimension(idx_h) * weights_info->dimension(idx_b));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_info->dimension(idx_w) * weights_info->dimension(idx_h) *
+ weights_info->dimension(idx_b));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != input_info->dimension(idx_w));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != input_info->dimension(idx_h));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(3) != input_info->dimension(idx_b));
- if(bias != nullptr)
+ if (bias != nullptr)
{
- if(is_qasymm)
+ if (is_qasymm)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
}
@@ -74,19 +83,26 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights_info->dimension(idx_b));
}
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second);
- auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info);
+ auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h),
+ weights_info->dimension(idx_w), weights_info->dimension(idx_h),
+ stride_info);
- const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
+ const TensorShape output_shape =
+ misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
}
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input,
+ ITensorInfo *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -95,11 +111,17 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second);
- auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info);
+ auto out_dims =
+ deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h),
+ weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info);
- const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
+ const TensorShape output_shape =
+ misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout).set_quantization_info(input->quantization_info()));
+ auto_init_if_empty(*output, input->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_layout(data_layout)
+ .set_quantization_info(input->quantization_info()));
Window win = calculate_max_window(*input);
@@ -107,27 +129,37 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
}
} // namespace
-CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel()
- : _add_bias(false),
- _bias(nullptr)
+CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel() : _add_bias(false), _bias(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+void CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
const PadStrideInfo &deconv_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, input_info, weights_info, deconv_info);
}
-void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info,
- const ITensorInfo *weights_info,
- const PadStrideInfo &deconv_info)
+void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, input_info, weights_info);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), input_info, weights_info, deconv_info));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr),
+ output->info(), input_info, weights_info, deconv_info));
+ auto padding_info = get_padding_info({input, bias, output});
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), input_info, weights_info, deconv_info);
+ auto win_config =
+ validate_and_configure_window(input->info(), output->info(), input_info, weights_info, deconv_info);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
const DataLayout data_layout = input_info->data_layout();
@@ -171,9 +203,14 @@ void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compi
_config_id += support::cpp11::to_string(output->info()->dimension(0));
_config_id += "_";
_config_id += support::cpp11::to_string(output->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLDeconvolutionReshapeOutputKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+Status CLDeconvolutionReshapeOutputKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, input_info, weights_info, deconv_info));
@@ -189,7 +226,7 @@ void CLDeconvolutionReshapeOutputKernel::run(const Window &window, cl::CommandQu
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, collapsed);
add_3D_tensor_argument(idx, _output, collapsed);
- if(_add_bias)
+ if (_add_bias)
{
add_1D_tensor_argument(idx, _bias, collapsed);
}
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
new file mode 100644
index 0000000000..8f436b07e3
--- /dev/null
+++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLDECONVOLUTIONLAYERRESHAPEOUTPUTKERNEL_H
+#define ARM_COMPUTE_CLDECONVOLUTIONLAYERRESHAPEOUTPUTKERNEL_H
+
+#include "src/core/CL/ICLSimpleKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the OpenCL kernel to be used for reshaping the tensor before returning the result of deconvolution.
+ *
+ * The input tensor to this OpenCL kernel is expected to be the result of a @ref CLGEMM operation between the Deconvolution input and the Deconvolution filter.
+ *
+ * The input tensor should have the following shape: [filter_width * filter_height * ofms, width, height, batch_size]
+ *
+ * The output tensor should have the following shape: [stride_x * (input_width - 1) + filter_width - 2 * padx, stride_y * (input_height - 1) + filter_height - 2 * pady, ofms, batch_size]
+ *
+ * For example, given a tensor with dimensions [4, 2, 2] this function returns a tensor with dimensions [1, 4, 4].
+ *
+ */
+class CLDeconvolutionReshapeOutputKernel : public ICLSimpleKernel
+{
+public:
+ /** Default constructor */
+ CLDeconvolutionReshapeOutputKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDeconvolutionReshapeOutputKernel(const CLDeconvolutionReshapeOutputKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDeconvolutionReshapeOutputKernel &operator=(const CLDeconvolutionReshapeOutputKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLDeconvolutionReshapeOutputKernel(CLDeconvolutionReshapeOutputKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLDeconvolutionReshapeOutputKernel &operator=(CLDeconvolutionReshapeOutputKernel &&) = default;
+ /** Default destructor */
+ ~CLDeconvolutionReshapeOutputKernel() = default;
+
+ /** Initialise the kernel's source and destination.
+ *
+ * @param[in] input Input tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
+ * @param[in] bias (Optional) Bias tensor to be added directly during the reshape operation. May be nullptr. Supported data types: same as @p input. Supported data layouts: same as @p input.
+ * @param[out] output Output tensor with the following shape: [stride_x * (input_width - 1) + filter_width - 2 * padx, stride_y * (input_height - 1) + filter_height - 2 * pady, ofms, batch_size]
+ * Supported data types: same as @p input. Supported data layouts: same as @p input.
+ * @param[in] input_info Deconvolution input tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input.
+ * @param[in] weights_info Deconvolution weights tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input.
+ * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
+ */
+ void configure(const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info);
+ /** Initialise the kernel's source and destination.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Input tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
+ * @param[in] bias (Optional) Bias tensor to be added directly during the reshape operation. May be nullptr. Supported data types: same as @p input. Supported data layouts: same as @p input.
+ * @param[out] output Output tensor with the following shape: [stride_x * (input_width - 1) + filter_width - 2 * padx, stride_y * (input_height - 1) + filter_height - 2 * pady, ofms, batch_size]
+ * Supported data types: same as @p input. Supported data layouts: same as @p input.
+ * @param[in] input_info Deconvolution input tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input.
+ * @param[in] weights_info Deconvolution weights tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input.
+ * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionReshapeOutputKernel.
+ *
+ * @param[in] input GEMM output tensor info to be reshaped. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
+ * @param[in] bias (Optional) Bias tensor info to be added directly during the reshape operation. May be nullptr. Supported data types: same as @p input. Supported data layouts: same as @p input.
+ * @param[in] output Reshaped output tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input.
+ * @param[in] input_info Original input tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input.
+ * @param[in] weights_info Original weights tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input.
+ * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
+ *
+ * @return a Status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ bool _add_bias;
+ const ICLTensor *_bias;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLDECONVOLUTIONLAYERRESHAPEOUTPUTKERNEL_H */
diff --git a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
deleted file mode 100644
index 65b603602c..0000000000
--- a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-
-#include "support/StringSupport.h"
-
-#include <map>
-
-using namespace arm_compute;
-
-namespace
-{
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int depth_offset, ITensorInfo *output)
-{
- ARM_COMPUTE_UNUSED(depth_offset);
-
- const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
-
- // The window needs to be based on input as we copy all the depths of input
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- win.set(Window::DimZ, Window::Dimension(0, input->tensor_shape().z(), 1));
-
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- bool window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-Status validate_arguments(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX));
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) != output->dimension(Window::DimY));
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) + depth_offset > output->dimension(2));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
-
- return Status{};
-}
-} // namespace
-
-CLDepthConcatenateLayerKernel::CLDepthConcatenateLayerKernel()
- : _input(nullptr), _output(nullptr), _depth_offset(0)
-{
-}
-
-void CLDepthConcatenateLayerKernel::configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, depth_offset, output);
-}
-
-void CLDepthConcatenateLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int depth_offset, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), depth_offset, output->info()));
-
- _input = input;
- _output = output;
- _depth_offset = depth_offset;
-
- const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
-
- // Add build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info())
- {
- const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
-
- build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
- build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
- build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale));
- build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, "concatenate", build_opts.options());
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), depth_offset, output->info());
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
- ICLKernel::configure_internal(std::get<1>(win_config));
-
- // Set output valid region
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-}
-
-Status CLDepthConcatenateLayerKernel::validate(const arm_compute::ITensorInfo *input,
- unsigned int depth_offset,
- const arm_compute::ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, depth_offset, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), depth_offset, output->clone().get()).first);
- return Status{};
-}
-
-void CLDepthConcatenateLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_3D();
-
- const int offset_to_first_elements_in_bytes = _depth_offset * _output->info()->strides_in_bytes()[2];
-
- unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
- _kernel.setArg<cl_int>(idx, offset_to_first_elements_in_bytes);
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
-}
diff --git a/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp b/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
deleted file mode 100644
index 868d4efc51..0000000000
--- a/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <set>
-#include <string>
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
-{
- ARM_COMPUTE_UNUSED(policy);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON(input == output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input,
- 1,
- DataType::U8, DataType::S8, DataType::QSYMM8_PER_CHANNEL, DataType::S16,
- DataType::U16, DataType::U32, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output,
- 1,
- DataType::U8, DataType::S8, DataType::QASYMM8, DataType::S16,
- DataType::U16, DataType::U32, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == output->data_type(), "Input and output data types must be different");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_float(input->data_type()) && shift != 0, "Shift is used only with integer non-quantized inputs");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(input->data_type()) && shift != 0, "Shift is used only with integer non-quantized inputs");
- ARM_COMPUTE_RETURN_ERROR_ON(shift >= 8);
-
- // Validate in case of configured output
- if(output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
-
- return Status{};
-}
-} // namespace
-
-void CLDepthConvertLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, shift);
-}
-
-void CLDepthConvertLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given)
- set_shape_if_empty(*output->info(), input->info()->tensor_shape());
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy, shift));
-
- // Get data sizes
- const size_t input_size = data_size_from_type(input->info()->data_type());
- const size_t output_size = data_size_from_type(output->info()->data_type());
-
- // Get number of elements to process per iterations
- constexpr unsigned int num_elems_processed_per_iteration = 16;
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
- // Conversions from float always SATURATE as out-of-bounds conversion from float->integer is implementation defined
- build_opts.add_option_if(is_data_type_float(input->info()->data_type()) || policy == ConvertPolicy::SATURATE, "-DSATURATE");
- build_opts.add_option_if(is_data_type_float(input->info()->data_type()) || is_data_type_float(output->info()->data_type()), "-DIS_DATA_TYPE_FLOAT");
- build_opts.add_option_if(is_data_type_quantized(input->info()->data_type()), "-DIS_DATA_TYPE_QUANTIZED");
-
- // Create kernel
- const std::string kernel_name = (input_size >= output_size) ? "convert_depth_down" : "convert_depth_up";
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set shift arg
- unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
- _kernel.setArg(idx++, shift);
-
- // Configure kernel
- ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
-
- // Collapse window
- const Window &full_window = window();
- Window collapsed_window = full_window.collapse_if_possible(full_window, Window::DimZ);
- ICLKernel::configure_internal(collapsed_window);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-Status CLDepthConvertLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, policy, shift));
-
- return Status{};
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
index ffd31552f1..cdf19ab2e1 100644
--- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,12 +21,16 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h"
+#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
using namespace arm_compute::misc::shape_calculator;
@@ -46,12 +50,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != 0);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape * input->tensor_shape()[idx_width]));
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape * input->tensor_shape()[idx_height]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] !=
+ (block_shape * input->tensor_shape()[idx_width]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] !=
+ (block_shape * input->tensor_shape()[idx_height]));
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -60,9 +66,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
}
} // namespace
-CLDepthToSpaceLayerKernel::CLDepthToSpaceLayerKernel()
- : _input(nullptr), _output(nullptr), _block_shape()
+CLDepthToSpaceLayerKernel::CLDepthToSpaceLayerKernel() : _input(nullptr), _output(nullptr), _block_shape()
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLDepthToSpaceLayerKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape)
@@ -70,13 +76,19 @@ void CLDepthToSpaceLayerKernel::configure(const ICLTensor *input, ICLTensor *out
configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
}
-void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ int32_t block_shape)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = compute_depth_to_space_shape(input->info(), block_shape);
+ TensorShape output_shape =
+ compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+ auto padding_info = get_padding_info({input, output});
+
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape));
_input = input;
@@ -92,11 +104,15 @@ void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_contex
build_opts.add_option("-DCHANNEL_SIZE=" + support::cpp11::to_string(input->info()->dimension(idx_channel)));
build_opts.add_option("-DBLOCK_SHAPE=" + support::cpp11::to_string(block_shape));
build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
- _kernel = create_kernel(compile_context, "depth_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel = create_kernel(compile_context,
+ "depth_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps());
ICLKernel::configure_internal(win);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
Status CLDepthToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
@@ -129,7 +145,6 @@ void CLDepthToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu
enqueue(queue, *this, slice_in, lws_hint());
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_in));
+ } while (window.slide_window_slice_3D(slice_in));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h
new file mode 100644
index 0000000000..cef70c4dda
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLDEPTHTOSPACELAYERKERNEL_H
+#define ARM_COMPUTE_CLDEPTHTOSPACELAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the depth to space kernel */
+class CLDepthToSpaceLayerKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLDepthToSpaceLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDepthToSpaceLayerKernel(const CLDepthToSpaceLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDepthToSpaceLayerKernel &operator=(const CLDepthToSpaceLayerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLDepthToSpaceLayerKernel(CLDepthToSpaceLayerKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLDepthToSpaceLayerKernel &operator=(CLDepthToSpaceLayerKernel &&) = default;
+ /** Default destructor */
+ ~CLDepthToSpaceLayerKernel() = default;
+ /** Initialise the kernel's inputs and output.
+ *
+ * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
+ * @param[out] output Tensor output. Data types supported: same as @p input
+ * @param[in] block_shape Block shape value.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+ /** Initialise the kernel's inputs and output.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
+ * @param[out] output Tensor output. Data types supported: same as @p input
+ * @param[in] block_shape Block shape value.
+ */
+ void
+ configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLDepthToSpaceLayerKernel.
+ *
+ * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
+ * @param[in] output Tensor output info. Data types supported: same as @p input
+ * @param[in] block_shape Block shape value.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ ICLTensor *_output; /**< Destination tensor */
+ int32_t _block_shape; /**< Block shape */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLDEPTHTOSPACELAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
deleted file mode 100644
index 936cdd849c..0000000000
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
+++ /dev/null
@@ -1,428 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D dilation,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.enabled()) && (input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC),
- "For QASYMM8 only logistic, relu, lower bounded relu and lower-upper bounded relu are supported");
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != 3 || weights->dimension(1) != 3);
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1 || conv_info.stride().first > 3);
-
- ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
-
- const bool is_qasymm = is_data_type_quantized_asymmetric(input->data_type());
-
- if(biases != nullptr)
- {
- if(is_qasymm)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
- }
- ARM_COMPUTE_RETURN_ERROR_ON((biases->dimension(0) != weights->dimension(2)) && (weights->dimension(2) != 1 || biases->dimension(0) != weights->dimension(3)));
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
-
- if(is_qasymm)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
-
- if(is_data_type_quantized_per_channel(weights->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != output_multipliers->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != output_shifts->dimension(0));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_RETURN_ERROR_ON(1 != output_multipliers->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON(1 != output_shifts->dimension(0));
- }
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- }
-
- if(output->total_size() != 0)
- {
- const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, GPUTarget gpu_target, std::string &kernel_name, const Size2D dilation)
-{
- // Output auto inizialitation if not yet initialized
- const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_quantization_info(output->quantization_info()));
-
- const unsigned int conv_stride_x = conv_info.stride().first;
- const unsigned int conv_stride_y = conv_info.stride().second;
- const bool is_qasymm = is_data_type_quantized_asymmetric(input->data_type());
- const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST;
-
- // Configure kernel window
- unsigned int num_elems_read_per_iteration_x = 0;
- unsigned int num_elems_read_per_iteration_y = 0;
- unsigned int num_elems_written_per_iteration_x = 0;
- unsigned int num_elems_written_per_iteration_y = 0;
-
- if(input->data_type() == DataType::F16)
- {
- kernel_name = "depthwise_convolution_3x3_f16";
- num_elems_written_per_iteration_x = 8 / data_size_from_type(input->data_type());
- num_elems_written_per_iteration_y = 1;
- num_elems_read_per_iteration_y = 3;
- switch(conv_stride_x)
- {
- case 1:
- num_elems_read_per_iteration_x = 8;
- break;
- case 2:
- num_elems_read_per_iteration_x = 9;
- break;
- case 3:
- num_elems_read_per_iteration_x = 16;
- break;
- default:
- num_elems_read_per_iteration_x = 3 + (num_elems_written_per_iteration_x - 1) * conv_stride_x;
- break;
- }
- if(is_bifrost)
- {
- if(conv_stride_x == 1 && conv_stride_y == 1)
- {
- kernel_name = "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16";
- num_elems_read_per_iteration_x = 8;
- num_elems_written_per_iteration_x = 4;
- num_elems_read_per_iteration_y = 6;
- num_elems_written_per_iteration_y = 4;
- }
- else if(conv_stride_x == 2 && conv_stride_y == 2)
- {
- kernel_name = "depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16";
- num_elems_read_per_iteration_x = 10;
- num_elems_written_per_iteration_x = 4;
- num_elems_read_per_iteration_y = 5;
- num_elems_written_per_iteration_y = 2;
- }
- }
- }
- else if(input->data_type() == DataType::F32 && is_bifrost)
- {
- if(conv_stride_x == 1 && conv_stride_y == 1)
- {
- kernel_name = "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32";
- num_elems_read_per_iteration_x = 4;
- num_elems_read_per_iteration_y = 6;
- num_elems_written_per_iteration_x = 2;
- num_elems_written_per_iteration_y = 4;
- }
- else if(conv_stride_x == 2 && conv_stride_y == 2)
- {
- kernel_name = "depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32";
- num_elems_read_per_iteration_x = 6;
- num_elems_read_per_iteration_y = 5;
- num_elems_written_per_iteration_x = 2;
- num_elems_written_per_iteration_y = 2;
- }
- else
- {
- kernel_name = "depthwise_convolution_3x3";
- num_elems_written_per_iteration_x = 8 / data_size_from_type(input->data_type());
- num_elems_written_per_iteration_y = 1;
- num_elems_read_per_iteration_x = 3 + (num_elems_written_per_iteration_x - 1) * conv_stride_x;
- num_elems_read_per_iteration_y = 3;
- }
- }
- else
- {
- const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()) && !is_data_type_quantized_per_channel(weights->data_type());
-
- kernel_name = is_qasymm ? "dwc_3x3_native_quantized8" : "depthwise_convolution_3x3";
- kernel_name += (is_qasymm && is_dot8_supported ? "_dot8" : "");
- kernel_name += (is_qasymm ? "_nchw" : "");
-
- num_elems_written_per_iteration_x = 8 / data_size_from_type(input->data_type());
- num_elems_written_per_iteration_y = (is_qasymm && conv_stride_y == 1 && dilation.y() == 1) ? 2 : 1;
- num_elems_read_per_iteration_x = 3 + (num_elems_written_per_iteration_x - 1) * conv_stride_x + (conv_stride_x > 1 ? 1 : 0);
- num_elems_read_per_iteration_y = num_elems_written_per_iteration_y + 2;
- }
- num_elems_read_per_iteration_x += (num_elems_read_per_iteration_x - 1) * (dilation.x() - 1);
- num_elems_read_per_iteration_y += (num_elems_read_per_iteration_y - 1) * (dilation.y() - 1);
-
- // Create window and update padding
- Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
-
- AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(),
- num_elems_read_per_iteration_x, num_elems_read_per_iteration_y,
- conv_stride_x, conv_stride_y);
- AccessWindowStatic weights_access(weights, 0, 0, 3, 3);
- AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
-
- bool window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLDepthwiseConvolutionLayer3x3NCHWKernel::CLDepthwiseConvolutionLayer3x3NCHWKernel()
- : _conv_stride_x(0), _conv_pad_top(0), _conv_pad_left(0)
-{
-}
-
-BorderSize CLDepthwiseConvolutionLayer3x3NCHWKernel::border_size() const
-{
- return _border_size;
-}
-
-void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation, output_multipliers, output_shifts);
-}
-
-void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
- conv_info, depth_multiplier, act_info, dilation,
- (output_multipliers != nullptr) ? output_multipliers->info() : nullptr,
- (output_shifts != nullptr) ? output_shifts->info() : nullptr));
-
- _input = input;
- _output = output;
- _weights = weights;
- _biases = biases;
- _conv_stride_x = conv_info.stride().first;
- _conv_stride_y = conv_info.stride().second;
- _conv_pad_left = conv_info.pad_left();
- _conv_pad_top = conv_info.pad_top();
- _border_size = BorderSize(_conv_pad_top, conv_info.pad_right(), conv_info.pad_bottom(), _conv_pad_left);
- _output_multipliers = output_multipliers;
- _output_shifts = output_shifts;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
-
- // Configure kernel window
- std::string kernel_name;
- const GPUTarget gpu_target = get_target();
-
- auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, depth_multiplier, gpu_target, kernel_name, dilation);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
- build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(_output->info()->tensor_shape().z()));
- build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(depth_multiplier));
- build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(_conv_stride_x));
- build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
- build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
- build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
-
- if(_is_quantized)
- {
- const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform();
- const UniformQuantizationInfo wq_info = _weights->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform();
-
- const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type());
- const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()) && !is_quantized_per_channel;
- build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(_conv_stride_y));
- build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iq_info.offset));
- build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wq_info.offset));
- build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oq_info.offset));
- build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(9 * iq_info.offset * wq_info.offset));
- build_opts.add_option_if(is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
- build_opts.add_option_if(is_dot8_supported, "-DIS_DOT8");
-
- // Compute non-per-channel multiplier and shift anyway to make OpenCL kernel simpler
- float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
- int output_multiplier = 0;
- int output_shift = 0;
- quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
- build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
- build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
-
- if(act_info.enabled())
- {
- const int a_val = quantize_qasymm8(act_info.a(), oq_info);
- const int b_val = quantize_qasymm8(act_info.b(), oq_info);
- const int o1 = oq_info.offset;
-
- build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
- build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
- build_opts.add_option("-DCONST_0=" + support::cpp11::to_string(o1));
-
- const float s1 = iq_info.scale;
- build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
- build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
- }
-
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DWEIGHTS_TYPE=" + get_cl_type_from_data_type(weights->info()->data_type()));
- build_opts.add_option("-DWEIGHTS_PROMOTED_TYPE=" + get_cl_promoted_type_from_data_type(weights->info()->data_type()));
- }
- else
- {
- build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
- build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
- build_opts.add_option_if(act_info.enabled(), "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(win_config.second.x().step()));
- }
-
- build_opts.add_option_if(input->info()->data_type() == DataType::F16, "-DIS_F16");
- build_opts.add_option_if(input->info()->data_type() == DataType::F32, "-DIS_F32");
-
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-Status CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target,
- const Size2D &dilation, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
- std::string kernel_name;
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation, output_multipliers, output_shifts));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(),
- conv_info, depth_multiplier, gpu_target, kernel_name, dilation)
- .first);
-
- return Status{};
-}
-
-void CLDepthwiseConvolutionLayer3x3NCHWKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-
- // Create input window and adjust
- Window collapsed_in = collapsed;
- collapsed_in.adjust(Window::DimX, -_conv_pad_left, true);
- collapsed_in.adjust(Window::DimY, -_conv_pad_top, true);
- collapsed_in.set_dimension_step(Window::DimX, collapsed_in.x().step() * _conv_stride_x);
- collapsed_in.set_dimension_step(Window::DimY, collapsed_in.y().step() * _conv_stride_y);
-
- Window slice_in = collapsed_in.first_slice_window_3D();
- Window slice_out = collapsed.first_slice_window_3D();
- Window slice_weights = window.first_slice_window_3D();
- slice_weights.set_dimension_step(Window::DimX, 0);
- slice_weights.set_dimension_step(Window::DimY, 0);
-
- unsigned int idx = 3 * num_arguments_per_3D_tensor();
-
- // Set output multipliers in case of quantized data type
- if(_is_quantized)
- {
- Window slice;
- slice.use_tensor_dimensions(_output_multipliers->info()->tensor_shape());
- add_1D_tensor_argument(idx, _output_multipliers, slice);
- add_1D_tensor_argument(idx, _output_shifts, slice);
- }
-
- // Set biases
- if(_biases != nullptr)
- {
- Window slice_biases;
- slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
- add_1D_tensor_argument(idx, _biases, slice_biases);
- }
-
- do
- {
- idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- add_3D_tensor_argument(idx, _output, slice_out);
- add_3D_tensor_argument(idx, _weights, slice_weights);
-
- enqueue(queue, *this, slice_out, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice_out) && collapsed_in.slide_window_slice_3D(slice_in));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
deleted file mode 100644
index fe72260e3b..0000000000
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
+++ /dev/null
@@ -1,468 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.enabled()) && (input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC),
- "For QASYMM8 only logistic, relu, lower bounded relu and lower-upper bounded relu are supported");
- ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier > 1); // COMPMID-1071 Add depth multiplier support for NHWC
-
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1);
- ARM_COMPUTE_RETURN_ERROR_ON(std::max(conv_info.pad_top(), conv_info.pad_bottom()) > 4);
-
- ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
-
- const bool is_qasymm = is_data_type_quantized_asymmetric(input->data_type());
- const size_t weights_width = 3;
- const size_t weights_height = 3;
-
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(
- *input, TensorInfo(TensorShape(weights_width, weights_height), 1, weights->data_type()).set_data_layout(DataLayout::NCHW), conv_info, depth_multiplier, dilation);
- if(is_qasymm)
- {
- DepthwiseConvolutionReshapeInfo info;
- info.c0 = 4;
- ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(0) / info.c0) != weights_width * weights_height);
-
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
-
- if(is_data_type_quantized_per_channel(weights->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON(output_shape[0] != output_multipliers->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON(output_shape[0] != output_shifts->dimension(0));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_RETURN_ERROR_ON(1 != output_multipliers->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON(1 != output_shifts->dimension(0));
- }
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(1) != weights_width) || (weights->dimension(2) != weights_height));
- }
-
- if(biases != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != output_shape[0]);
- if(is_qasymm)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
- ITensorInfo *output_multipliers, ITensorInfo *output_shifts)
-{
- const size_t weights_width = 3;
- const size_t weights_height = 3;
-
- // Get convolved dimensions
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(
- *input, TensorInfo(TensorShape(weights_width, weights_height), 1, weights->data_type()).set_data_layout(DataLayout::NCHW), conv_info, depth_multiplier, dilation);
-
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_quantization_info(output->quantization_info()));
-
- const bool is_qasymm = is_data_type_quantized_asymmetric(input->data_type());
- const bool is_stride_1_dilation_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1) && dilation.x() == 1 && dilation.y() == 1);
-
- const unsigned int num_rows_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1;
- const unsigned int num_elems_accessed_per_iteration = is_qasymm ? 4 : (8 / input->element_size());
- const unsigned int num_rows_read_per_iteration = num_rows_processed_per_iteration + 2;
- const unsigned int num_rows_written_per_iteration = std::ceil(num_rows_processed_per_iteration / static_cast<float>(conv_info.stride().first));
-
- BorderSize border_size;
- border_size = BorderSize(conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0);
-
- // Configure kernel window
- Window win = calculate_max_window(*output, Steps(num_elems_accessed_per_iteration, num_rows_written_per_iteration));
-
- AccessWindowStatic input_access(input, 0, -border_size.top, ceil_to_multiple(input->dimension(0), num_elems_accessed_per_iteration),
- ceil_to_multiple(input->dimension(1) + border_size.bottom, num_rows_read_per_iteration));
- AccessWindowRectangle output_access(output, 0, 0, num_elems_accessed_per_iteration, num_rows_written_per_iteration);
-
- bool window_changed = false;
-
- if(is_qasymm)
- {
- if((output_multipliers != nullptr) && (output_shifts != nullptr))
- {
- AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_accessed_per_iteration);
- AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_accessed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, input_access, output_access, output_multipliers_access, output_shifts_access);
- }
- else
- {
- Status err = ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "output_multipliers and output_shifts must be non-nullptr for quantized input");
- return std::make_pair(err, win);
- }
- }
- else
- {
- AccessWindowStatic weights_access(weights, 0, 0, ceil_to_multiple(weights->dimension(0), num_elems_accessed_per_iteration), weights->dimension(1));
- window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
- }
-
- if(bias != nullptr)
- {
- AccessWindowHorizontal bias_access(bias, 0, num_elems_accessed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, bias_access);
- }
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLDepthwiseConvolutionLayer3x3NHWCKernel::CLDepthwiseConvolutionLayer3x3NHWCKernel()
- : _num_rows_processed_per_iteration(1), _num_planes_processed_per_iteration(1)
-{
-}
-
-BorderSize CLDepthwiseConvolutionLayer3x3NHWCKernel::border_size() const
-{
- return _border_size;
-}
-
-void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation, output_multipliers, output_shifts);
-}
-
-void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
- conv_info, depth_multiplier, act_info, dilation,
- (output_multipliers != nullptr) ? output_multipliers->info() : nullptr,
- (output_shifts != nullptr) ? output_shifts->info() : nullptr));
- auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
- conv_info, depth_multiplier, dilation,
- (output_multipliers != nullptr) ? output_multipliers->info() : nullptr,
- (output_shifts != nullptr) ? output_shifts->info() : nullptr);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
- const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
- const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
-
- const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type());
- const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()) && !is_quantized_per_channel;
-
- _input = input;
- _output = output;
- _weights = weights;
- _biases = biases;
- _conv_stride_y = conv_info.stride().second;
- _num_rows_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1;
- _num_planes_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1;
- _output_multipliers = output_multipliers;
- _output_shifts = output_shifts;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
-
- // If QASYMM8 and the 8 bit dot product is available, force _num_planes_processed_per_iteration to 1
- if(is_dot8_supported && _is_quantized)
- {
- _num_planes_processed_per_iteration = 1;
- }
-
- _border_size = BorderSize(_is_quantized && is_stride_1 ? 0 : conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0);
-
- const unsigned int num_elems_accessed_per_iteration = _is_quantized ? 4 : (8 / input->info()->element_size());
-
- CLBuildOptions build_opts;
- build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
- build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_accessed_per_iteration));
- build_opts.add_option("-DSRC_DIM_2=" + support::cpp11::to_string(_input->info()->dimension(2)));
- build_opts.add_option("-DCONV_PAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
- build_opts.add_option("-DCONV_PAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
- build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
- build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
-
- if(_is_quantized)
- {
- const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform();
- const UniformQuantizationInfo wq_info = _weights->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform();
-
- build_opts.add_option("-DSRC_DIM_1=" + support::cpp11::to_string(_input->info()->dimension(1)));
- build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iq_info.offset));
- build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wq_info.offset));
- build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oq_info.offset));
- build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(9 * iq_info.offset * wq_info.offset));
- build_opts.add_option_if(is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
- build_opts.add_option_if(is_dot8_supported, "-DIS_DOT8");
-
- // Compute non-per-channel multiplier and shift anyway to make OpenCL kernel simpler
- float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
- int output_multiplier = 0;
- int output_shift = 0;
- quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
- build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
- build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
-
- if(act_info.enabled())
- {
- const int a_val = quantize_qasymm8(act_info.a(), oq_info);
- const int b_val = quantize_qasymm8(act_info.b(), oq_info);
- const int o1 = oq_info.offset;
-
- build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
- build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
- build_opts.add_option("-DCONST_0=" + support::cpp11::to_string(o1));
-
- const float s1 = iq_info.scale;
- build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
- build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
- }
-
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DWEIGHTS_TYPE=" + get_cl_type_from_data_type(weights->info()->data_type()));
- build_opts.add_option("-DWEIGHTS_PROMOTED_TYPE=" + get_cl_promoted_type_from_data_type(weights->info()->data_type()));
- }
- else
- {
- build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
- build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
- }
-
- if(is_stride_1_dilation_1)
- {
- build_opts.add_option("-DNUM_ROWS_PROCESSED=" + support::cpp11::to_string(_num_rows_processed_per_iteration));
- build_opts.add_option("-DNUM_PLANES_PROCESSED=" + support::cpp11::to_string(_num_planes_processed_per_iteration));
- build_opts.add_option("-DDST_DIM_2=" + support::cpp11::to_string(_output->info()->dimension(2)));
- }
- else
- {
- build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
- build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(_conv_stride_y));
- }
- build_opts.add_option_if(_input->info()->tensor_shape().total_size_upper(3) > 1,
- "-DDST_DEPTH=" + support::cpp11::to_string(static_cast<int>(std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)))));
-
- std::string kernel_name;
- // Create kernel
- if(_is_quantized)
- {
- kernel_name = std::string("dwc_3x3_reshaped_quantized8");
- kernel_name += (is_dot8_supported && is_stride_1_dilation_1 ? "_dot8" : "");
- kernel_name += (is_stride_1_dilation_1 ? "_stride1" : "");
- kernel_name += "_nhwc";
- }
- else
- {
- kernel_name = std::string("depthwise_convolution_3x3_nhwc");
- kernel_name += (is_stride_1_dilation_1 ? "_stride1" : "");
- }
-
- build_opts.add_option_if(input->info()->data_type() == DataType::F16, "-DIS_F16");
- build_opts.add_option_if(input->info()->data_type() == DataType::F32, "-DIS_F32");
-
- ICLKernel::configure_internal(win_config.second);
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += string_from_data_type(input->info()->data_type());
-}
-
-Status CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation, output_multipliers, output_shifts));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(),
- biases != nullptr ? biases->clone().get() : nullptr,
- output->clone().get(), conv_info, depth_multiplier, dilation,
- (output_multipliers != nullptr) ? output_multipliers->clone().get() : nullptr,
- (output_shifts != nullptr) ? output_shifts->clone().get() : nullptr)
- .first);
-
- return Status{};
-}
-
-void CLDepthwiseConvolutionLayer3x3NHWCKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- // Collapse window
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- const size_t total_batches = _input->info()->tensor_shape().total_size_upper(3);
-
- Window win = window_collapsed;
- win.set(Window::DimZ, Window::Dimension(0, std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)) * total_batches, 1));
-
- // Create input window and adjust
- Window win_in = win;
- win_in.set_dimension_step(Window::DimY, _num_rows_processed_per_iteration);
- win_in.set_dimension_step(Window::DimZ, _conv_stride_y);
-
- ARM_COMPUTE_ERROR_ON((win_in.y().step() < window.y().step()) || (win_in.z().step() < window.z().step()));
-
- Window slice_in = win_in.first_slice_window_4D();
- Window slice_out = win.first_slice_window_4D();
-
- unsigned int idx = 2 * num_arguments_per_4D_tensor() + (_is_quantized ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor());
-
- if(_is_quantized)
- {
- Window slice;
- slice.use_tensor_dimensions(_output_multipliers->info()->tensor_shape());
- slice.set_dimension_step(Window::DimX, window.x().step());
- add_1D_tensor_argument(idx, _output_multipliers, slice);
- add_1D_tensor_argument(idx, _output_shifts, slice);
- }
-
- if(_biases != nullptr)
- {
- Window win_biases;
- win_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
- win_biases.set_dimension_step(Window::DimX, window.x().step());
- add_1D_tensor_argument(idx, _biases, win_biases);
- }
-
- // Calculate the max_offset.
- // max_offset is the offset for the last NOT valid value in the Z dimension (spatial dimension Y for NHWC)
- // |******************|
- // | pad_top |
- // |******************|
- // | |
- // | plane0 |
- // | batch0 |
- // |__________________|
- // |******************| Batch 0
- // | pad_bottom |
- // | pad_top |
- // |******************|
- // | |
- // | plane1 |
- // | batch0 |
- // |__________________|-----> max_offset
- // |******************|
- // | pad_bottom |
- // | pad_top |
- // |******************|
- // | |
- // | plane0 |
- // | batch1 |
- // |__________________|
- // |******************| Batch 1
- // | pad_bottom |
- // | pad_top |
- // |******************|
- // | |
- // | plane1 |
- // | batch1 |
- // |__________________|
- // | pad_bottom |
- // |******************|
- const int max_offset = _input->info()->strides_in_bytes().z() * _input->info()->dimension(2) - (_input->info()->padding().bottom + _input->info()->padding().top) *
- _input->info()->strides_in_bytes().y();
- _kernel.setArg(idx, max_offset);
-
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice_in);
- add_4D_tensor_argument(idx, _output, slice_out);
- if(_is_quantized)
- {
- add_2D_tensor_argument(idx, _weights, slice_out);
- }
- else
- {
- add_3D_tensor_argument(idx, _weights, slice_out);
- }
- enqueue(queue, *this, slice_out, lws_hint());
- }
- while(win.slide_window_slice_4D(slice_out) && win_in.slide_window_slice_4D(slice_in));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index e6c9861c4a..b95abe795f 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,54 +21,101 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
+#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLKernel.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLUtils.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCWeightsKernelInfo &dwc_weights_info,
- const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts)
{
ARM_COMPUTE_UNUSED(dwc_info);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ bool in_place = false;
+ if (output == nullptr || output == input)
+ {
+ in_place = true;
+ output = input;
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier > 1 && dwc_weights_info.n0 != 1);
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1);
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().second < 1);
- ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && dwc_info.m0 != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && dwc_info.m0 != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_input_to_cl_image == true));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) &&
+ (export_to_cl_image(weights) == false),
+ "Weights cannot be exported to cl_image!");
+ ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_weights_to_cl_image == true) && ((dwc_info.n0 % 4) != 0));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().second < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON((conv_info.dilation.x() < 1) || (conv_info.dilation.y() < 1));
const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_UNUSED(idx_c);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_c) != (input->dimension(idx_c) * depth_multiplier));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_c) != (input->dimension(idx_c) * conv_info.depth_multiplier));
+
+ // In place restrictions
+ if (in_place)
+ {
+ const int weights_width_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+ const int weights_height_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[weights_width_idx] != 1U ||
+ weights->tensor_shape()[weights_height_idx] != 1U);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.depth_multiplier != 1U);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride() != std::make_pair(1U, 1U));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation != Size2D(1U, 1U));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ conv_info.pad_stride_info
+ .has_padding()); // Note that in princple padding can be supported with in_place but we choose not to support it
+ }
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
+ const ConvolutionInfo info{conv_info.pad_stride_info, conv_info.depth_multiplier, ActivationLayerInfo(),
+ conv_info.dilation};
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info);
+
+ if (conv_info.depth_multiplier > 1 && dwc_info.n0 > 1)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON((conv_info.depth_multiplier % dwc_info.n0) != 0);
+ }
const bool is_quantized = is_data_type_quantized(input->data_type());
- if(biases != nullptr)
+ if (biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != output_shape[idx_c]);
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- if(is_quantized)
+ if (is_quantized)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
}
@@ -78,7 +125,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
}
}
- if(is_quantized)
+ if (is_quantized)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
@@ -86,7 +133,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
- if(is_data_type_quantized_per_channel(weights->data_type()))
+ if (is_data_type_quantized_per_channel(weights->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(output_shape[idx_c] != output_multipliers->dimension(0));
@@ -104,80 +151,28 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
}
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
- if(is_data_type_quantized(input->data_type()))
+ if (is_data_type_quantized(input->data_type()))
{
const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info;
+ const UniformQuantizationInfo oq_info =
+ (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info;
float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
int output_multiplier = 0;
int output_shift = 0;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
}
return Status{};
}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *output, const DWCWeightsKernelInfo &dwc_weights_info,
- const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
- ITensorInfo *output_multipliers, ITensorInfo *output_shifts)
-{
- ARM_COMPUTE_UNUSED(dwc_info);
-
- // Get convolved dimensions
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
-
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_quantization_info(output->quantization_info()));
-
- const unsigned int n0 = dwc_weights_info.n0;
-
- // Configure kernel window
- Window win = calculate_max_window(*output, Steps(n0));
-
- // The following access windows are only valid in case of NHWC and because n0 must unit in case depth_multiplier > 1
- AccessWindowHorizontal input_access(input, 0, n0);
- AccessWindowHorizontal weights_access(weights, 0, n0);
- AccessWindowHorizontal output_access(output, 0, n0);
-
- bool window_changed = false;
-
- if(bias != nullptr)
- {
- AccessWindowHorizontal bias_access(bias, 0, n0);
- window_changed = update_window_and_padding(win, input_access, weights_access, bias_access, output_access);
- }
- else
- {
- window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
- }
-
- if(is_data_type_quantized(input->data_type()))
- {
- if((output_multipliers != nullptr) && (output_shifts != nullptr))
- {
- AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, n0);
- AccessWindowHorizontal output_shifts_access(output_shifts, 0, n0);
- window_changed = window_changed || update_window_and_padding(win, output_multipliers_access, output_shifts_access);
- }
- else
- {
- Status err = ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "output_multipliers and output_shifts must be non-nullptr for quantized input");
- return std::make_pair(err, win);
- }
- }
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
} // namespace
CLDepthwiseConvolutionLayerNativeKernel::CLDepthwiseConvolutionLayerNativeKernel()
@@ -188,112 +183,200 @@ CLDepthwiseConvolutionLayerNativeKernel::CLDepthwiseConvolutionLayerNativeKernel
_depth_multiplier(1),
_output_multipliers(nullptr),
_output_shifts(nullptr),
+ _export_input_to_cl_image(false),
+ _export_weights_to_cl_image(false),
_is_quantized(false)
{
+ _type = CLKernelType::DEPTHWISE;
}
-void CLDepthwiseConvolutionLayerNativeKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
- const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
+void CLDepthwiseConvolutionLayerNativeKernel::configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ICLTensor *output_multipliers,
+ const ICLTensor *output_shifts)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, output_multipliers, output_shifts);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_info, conv_info,
+ output_multipliers, output_shifts);
}
-void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const DWCWeightsKernelInfo &dwc_weights_info,
- const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
+void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ICLTensor *output_multipliers,
+ const ICLTensor *output_shifts)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
- dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation,
- (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr));
-
- auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
- dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation,
- (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
- _input = input;
- _output = output;
- _weights = weights;
- _biases = biases;
- _depth_multiplier = depth_multiplier;
- _output_multipliers = output_multipliers;
- _output_shifts = output_shifts;
- _is_quantized = is_data_type_quantized(input->info()->data_type());
-
- const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
- const size_t weights_width = weights->info()->dimension(idx_w);
- const size_t weights_height = weights->info()->dimension(idx_h);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
+ if (output == nullptr)
+ {
+ // In-place
+ output = input;
+ }
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
+ input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), dwc_info,
+ conv_info, (output_multipliers != nullptr) ? output_multipliers->info() : nullptr,
+ (output_shifts != nullptr) ? output_shifts->info() : nullptr));
+
+ auto padding_info = get_padding_info({input, output});
+
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(
+ *(input->info()), *(weights->info()), conv_info);
+ auto_init_if_empty(*(output->info()), input->info()
+ ->clone()
+ ->set_tensor_shape(output_shape)
+ .set_quantization_info(output->info()->quantization_info()));
+
+ _input = input;
+ _output = output;
+ _weights = weights;
+ _biases = biases;
+ _depth_multiplier = conv_info.depth_multiplier;
+ _output_multipliers = output_multipliers;
+ _output_shifts = output_shifts;
+ _export_input_to_cl_image = dwc_info.export_input_to_cl_image;
+ _export_weights_to_cl_image = dwc_info.export_weights_to_cl_image;
+ _is_quantized = is_data_type_quantized(input->info()->data_type());
+
+ const unsigned int n0 = adjust_vec_size(dwc_info.n0, output->info()->dimension(0));
+ const unsigned int m0 = std::min(dwc_info.m0, (unsigned int)output->info()->dimension(1));
+ std::string kernel_name = "";
CLBuildOptions build_opts;
- build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
- build_opts.add_option_if(_input->info()->tensor_shape().total_size_upper(3) > 1, "-DDST_DEPTH=" + support::cpp11::to_string(static_cast<int>(_output->info()->dimension(2))));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
- build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(dwc_info.activation_info.activation())));
- build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(depth_multiplier));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(dwc_weights_info.n0));
- build_opts.add_option("-DSRC_DIM1=" + support::cpp11::to_string(_input->info()->dimension(1)));
- build_opts.add_option("-DSRC_DIM2=" + support::cpp11::to_string(_input->info()->dimension(2)));
- build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(weights_width));
- build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(weights_height));
- build_opts.add_option("-DCONV_PAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
- build_opts.add_option("-DCONV_PAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
- build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
- build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
- build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
- build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
-
- std::string kernel_name = (_is_quantized) ? "dwc_MxN_native_quantized8_nhwc" : "dwc_MxN_native_fp_nhwc";
-
- if(_is_quantized)
+
+ // Update the padding for the input/weights tensor if we can export to cl_image
+ if (_export_input_to_cl_image)
{
- const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform();
- const UniformQuantizationInfo wq_info = _weights->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform();
+ arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(input->info());
+ }
- build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iq_info.offset));
- build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wq_info.offset));
- build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oq_info.offset));
- build_opts.add_option_if(is_data_type_quantized_per_channel(weights->info()->data_type()), "-DPER_CHANNEL_QUANTIZATION");
+ if (_export_weights_to_cl_image)
+ {
+ arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(weights->info());
+ }
- // Compute non-per-channel multiplier and shift anyway to make OpenCL kernel simpler
- float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
- int output_multiplier = 0;
- int output_shift = 0;
- quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
- build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
- build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
+ // Conditions of -cl-fast-relaxed-math causing accuracy issues can be traced from COMPMID-5324
+ const GPUTarget gpu_target = get_target();
+ const auto act_function = conv_info.act_info.activation();
+ const auto dst_data_type = _output->info()->data_type();
- if(dwc_info.activation_info.enabled())
- {
- const int a_val = quantize_qasymm8(dwc_info.activation_info.a(), oq_info);
- const int b_val = quantize_qasymm8(dwc_info.activation_info.b(), oq_info);
- const int o1 = oq_info.offset;
+ if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+ (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU ||
+ act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) &&
+ (dst_data_type == DataType::F32 || dst_data_type == DataType::F16))
+ {
+ // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
+ // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
+ build_opts.add_option("-cl-unsafe-math-optimizations");
+ }
+ else
+ {
+ build_opts.add_option("-cl-fast-relaxed-math");
+ }
- build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
- build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
- build_opts.add_option("-DCONST_0=" + support::cpp11::to_string(o1));
+ build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_function)));
+ build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(conv_info.depth_multiplier));
+ build_opts.add_option_if_else(_export_input_to_cl_image, "-DSRC_TENSOR_TYPE=IMAGE", "-DSRC_TENSOR_TYPE=BUFFER");
+ // Note: SRC_DATA_TYPE must be the same data type as WEI_DATA_TYPE. In the quantized case, the data types
+ // of the activation and weights may differ. However, since the implementation only works when both have
+ // the same data type, we have to change the offset to take this aspect into account
+ build_opts.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
+ build_opts.add_option("-DDST_TENSOR_TYPE=BUFFER");
+ build_opts.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst_data_type));
+ build_opts.add_option_if_else(_export_weights_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER");
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(_input->info()->dimension(1)));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(_input->info()->dimension(2)));
+ build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(1)));
+ build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(_output->info()->dimension(2)));
+ build_opts.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(_weights->info()->dimension(1)));
+ build_opts.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(_weights->info()->dimension(2)));
+ build_opts.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(_weights->info()->data_type()));
+ build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_stride_info.pad_top()));
+ build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_stride_info.pad_left()));
+ build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.pad_stride_info.stride().first));
+ build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.pad_stride_info.stride().second));
+ build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(conv_info.dilation.x()));
+ build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(conv_info.dilation.y()));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+ build_opts.add_option("-DM0_A=" + support::cpp11::to_string(_weights->info()->dimension(1) + m0 - 1));
+ build_opts.add_option_if_else(conv_info.depth_multiplier > 1, "-DN0_A=1",
+ "-DN0_A=" + support::cpp11::to_string(n0));
+ build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(_output->info()->dimension(0) % n0));
+ build_opts.add_option_if(_input->info()->num_dimensions() > 3, "-DBATCHED_EXECUTION");
+
+ // Force unroll with pragma when any of the following values exceed the maximum number of manual unroll
+ set_unroll_with_pragma(build_opts, {static_cast<int>(_weights->info()->dimension(1) + m0 - 1),
+ static_cast<int>(_weights->info()->dimension(1)),
+ static_cast<int>(_weights->info()->dimension(2))});
+
+ if (biases != nullptr)
+ {
+ build_opts.add_option(std::string("-DHAS_BIAS"));
+ build_opts.add_option(
+ std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->info()->data_type())));
+ }
- const float s1 = iq_info.scale;
- build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
- build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
- }
+ if (_is_quantized)
+ {
+ kernel_name = "dwc_native_quantized_nhwc";
+ const UniformQuantizationInfo iqinfo = input->info()->quantization_info().uniform();
+ const UniformQuantizationInfo wqinfo = weights->info()->quantization_info().uniform();
+ const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform();
+
+ PixelValue zero_value = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
+ int zero_value_s32;
+ zero_value.get(zero_value_s32);
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DWEIGHTS_TYPE=" + get_cl_type_from_data_type(weights->info()->data_type()));
+ float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+ build_opts.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+ build_opts.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift));
+ build_opts.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
+ build_opts.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
+ build_opts.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
+ build_opts.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32));
+ build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32));
+ build_opts.add_option("-DDST_MULTIPLIERS_DATA_TYPE=" +
+ get_cl_type_from_data_type(_output_multipliers->info()->data_type()));
+ build_opts.add_option("-DDST_SHIFTS_DATA_TYPE=" +
+ get_cl_type_from_data_type(_output_shifts->info()->data_type()));
+ build_opts.add_option_if_else(weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL,
+ "-DQUANTIZATION_TYPE=PER_CHANNEL", "-DQUANTIZATION_TYPE=PER_TENSOR");
+ // Note: We expect the input and output tensors to always adopt a per-tensor quantization approach
+ int a_val{};
+ int b_val{};
+ std::tie(b_val, a_val) =
+ get_quantized_activation_min_max(conv_info.act_info, input->info()->data_type(), oqinfo);
+
+ build_opts.add_option_if(conv_info.act_info.enabled(), "-DA_VAL=" + support::cpp11::to_string(a_val));
+ build_opts.add_option_if(conv_info.act_info.enabled(), "-DB_VAL=" + support::cpp11::to_string(b_val));
}
else
{
- build_opts.add_option_if(dwc_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(dwc_info.activation_info.a()));
- build_opts.add_option_if(dwc_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(dwc_info.activation_info.b()));
+ kernel_name = "dwc_native_fp_nhwc";
+ build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option_if(conv_info.act_info.enabled(),
+ "-DA_VAL=" + float_to_string_with_full_precision(conv_info.act_info.a()));
+ build_opts.add_option_if(conv_info.act_info.enabled(),
+ "-DB_VAL=" + float_to_string_with_full_precision(conv_info.act_info.b()));
}
- ICLKernel::configure_internal(win_config.second);
+ Window win = calculate_max_window(*(output->info()), Steps(n0, m0));
+ ICLKernel::configure_internal(win);
+
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+
// Set config_id for enabling LWS tuning
_config_id = kernel_name;
_config_id += "_";
@@ -312,18 +395,17 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
_config_id += string_from_data_type(input->info()->data_type());
}
-Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const DWCWeightsKernelInfo &dwc_weights_info, const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const Size2D &dilation, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, output_multipliers, output_shifts));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(),
- biases != nullptr ? biases->clone().get() : nullptr,
- output->clone().get(), dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation,
- output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr,
- output_shifts != nullptr ? output_shifts->clone().get() : nullptr)
- .first);
-
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts));
return Status{};
}
@@ -334,37 +416,61 @@ void CLDepthwiseConvolutionLayerNativeKernel::run(const Window &window, cl::Comm
// Collapse window
Window window_collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
- Window slice_in = window.first_slice_window_4D();
- Window slice_out = window_collapsed.first_slice_window_4D();
- if(_depth_multiplier != 1)
- {
- ARM_COMPUTE_ERROR_ON(slice_out.x().step() != 1);
- slice_out.set(Window::DimX, Window::Dimension(0, _input->info()->tensor_shape()[0], 1));
- }
+ Window slice = window_collapsed.first_slice_window_4D();
- unsigned int idx = 2 * num_arguments_per_4D_tensor() + num_arguments_per_3D_tensor();
+ cl::Image2D input_cl_image;
+ cl::Image2D weights_cl_image;
- // Set output multipliers in case of quantized data type
- if(_is_quantized)
+ if (_export_input_to_cl_image || _export_weights_to_cl_image)
{
- add_1D_tensor_argument(idx, _output_multipliers, slice_in);
- add_1D_tensor_argument(idx, _output_shifts, slice_in);
+ // Export cl_buffer to cl_image
+ if (_export_input_to_cl_image)
+ {
+ const size_t image_w = _input->info()->dimension(0) / 4;
+ const size_t image_h =
+ _input->info()->dimension(1) * _input->info()->dimension(2) * _input->info()->dimension(3);
+ const TensorShape shape2d(image_w, image_h);
+ const size_t image_row_pitch = _input->info()->strides_in_bytes()[1];
+ input_cl_image =
+ create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d,
+ _input->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+ }
+
+ if (_export_weights_to_cl_image)
+ {
+ const size_t image_w = _weights->info()->dimension(0) / 4;
+ const size_t image_h =
+ _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3);
+ const TensorShape shape2d(image_w, image_h);
+ const size_t image_row_pitch = _weights->info()->strides_in_bytes()[1];
+ weights_cl_image =
+ create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d,
+ _weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+ }
}
- if(_biases != nullptr)
+ unsigned int idx = 0;
+ if (_export_input_to_cl_image)
{
- add_1D_tensor_argument(idx, _biases, slice_in);
+ _kernel.setArg(idx++, input_cl_image);
}
-
- do
+ add_4d_tensor_nhwc_argument(idx, _input);
+ add_4d_tensor_nhwc_argument(idx, _output);
+ if (_export_weights_to_cl_image)
+ {
+ _kernel.setArg(idx++, weights_cl_image);
+ }
+ add_4d_tensor_nhwc_argument(idx, _weights);
+ if (_is_quantized)
+ {
+ add_1D_tensor_argument(idx, _output_multipliers, slice);
+ add_1D_tensor_argument(idx, _output_shifts, slice);
+ }
+ if (_biases != nullptr)
{
- idx = 0;
- add_4D_tensor_argument(idx, _input, slice_in);
- add_4D_tensor_argument(idx, _output, slice_out);
- add_3D_tensor_argument(idx, _weights, slice_out);
- enqueue(queue, *this, slice_out, lws_hint());
+ add_1D_tensor_argument(idx, _biases, slice);
}
- while(window_collapsed.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
+ enqueue(queue, *this, slice, lws_hint());
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
new file mode 100644
index 0000000000..d34a662966
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
+#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/function_info/ConvolutionInfo.h"
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to run a MxN depthwise convolution. M and N are respectively the rows and columns of the filter
+ This kernel assumes that tensor for the weights is NOT reshaped (Native version) */
+class CLDepthwiseConvolutionLayerNativeKernel : public ICLKernel
+{
+public:
+ /** Default Constructor */
+ CLDepthwiseConvolutionLayerNativeKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDepthwiseConvolutionLayerNativeKernel(const CLDepthwiseConvolutionLayerNativeKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDepthwiseConvolutionLayerNativeKernel &operator=(const CLDepthwiseConvolutionLayerNativeKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLDepthwiseConvolutionLayerNativeKernel(CLDepthwiseConvolutionLayerNativeKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLDepthwiseConvolutionLayerNativeKernel &operator=(CLDepthwiseConvolutionLayerNativeKernel &&) = default;
+
+ /** Initialize the function's source, destination and parameters
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
+ * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, N, M].
+ * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
+ * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+ * @param[out] output Destination tensor. Pass in nullptr or @p input for in-place operation. Data type supported: Same as @p input.
+ * @param[in] dwc_info Depthwise convolution layer info
+ * @param[in] conv_info Convolution info (padding, stride, dilation, ...)
+ * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
+ * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+ * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
+ * the number of shifts must be equal to the number of filters (IFM). Supported data types: S32
+ *
+ * @note: In-place is only supported when
+ * * data layout: NHWC
+ * * filter: 1x1
+ * * @p depth_multiplier: 1
+ * * strides: 1
+ * * dilation: 1
+ * * no padding
+ * * no change of data layout after configure
+ */
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ICLTensor *output_multipliers = nullptr,
+ const ICLTensor *output_shifts = nullptr);
+
+ /** Initialize the function's source, destination and parameters using the compile context of @ref CLKernelLibrary
+ *
+ * Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure()
+ */
+ void configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ICLTensor *output_multipliers = nullptr,
+ const ICLTensor *output_shifts = nullptr);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel
+ *
+ * Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ITensorInfo *output_multipliers = nullptr,
+ const ITensorInfo *output_shifts = nullptr);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input{};
+ const ICLTensor *_weights{};
+ const ICLTensor *_biases{};
+ ICLTensor *_output{};
+ unsigned int _depth_multiplier{0};
+ const ICLTensor *_output_multipliers{};
+ const ICLTensor *_output_shifts{};
+ bool _export_input_to_cl_image{false};
+ bool _export_weights_to_cl_image{false};
+ bool _is_quantized{false};
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H */
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp
deleted file mode 100644
index d284203b76..0000000000
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const DepthwiseConvolutionReshapeInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
- ARM_COMPUTE_RETURN_ERROR_ON(info.c0 != 4);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_h) != 3);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != 3);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
-
- if(output->total_size() != 0)
- {
- auto reshaped_weights_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*input, info);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), reshaped_weights_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const DepthwiseConvolutionReshapeInfo &info)
-{
- auto reshaped_input_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*input, info);
- auto_init_if_empty(*output, reshaped_input_shape, 1, input->data_type(), input->quantization_info());
-
- Window win = calculate_max_window(*input, Steps(info.c0));
- AccessWindowHorizontal weights_access(input, 0, info.c0);
- const bool window_changed = update_window_and_padding(win, weights_access);
-
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLDepthwiseConvolutionLayerReshapeWeightsKernel::CLDepthwiseConvolutionLayerReshapeWeightsKernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLDepthwiseConvolutionLayerReshapeWeightsKernel::configure(const ICLTensor *input, ICLTensor *output, const DepthwiseConvolutionReshapeInfo &info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
-}
-
-void CLDepthwiseConvolutionLayerReshapeWeightsKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const DepthwiseConvolutionReshapeInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), info));
- auto win_config = validate_and_configure_window(input->info(), output->info(), info);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
- ICLKernel::configure_internal(win_config.second);
-
- _input = input;
- _output = output;
-
- // Build the kernel
- CLBuildOptions build_opts;
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(info.c0));
- build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(0)));
- build_opts.add_option_if(info.transpose, "-DTRANSPOSE");
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
-
- _kernel = create_kernel(compile_context, "depthwise_convolution_reshape_weights", build_opts.options());
-}
-
-Status CLDepthwiseConvolutionLayerReshapeWeightsKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const DepthwiseConvolutionReshapeInfo &info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), info).first);
- return Status{};
-}
-
-void CLDepthwiseConvolutionLayerReshapeWeightsKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, window);
- add_2D_tensor_argument(idx, _output, window);
- enqueue(queue, *this, window, lws_hint());
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
deleted file mode 100644
index ec9b5cb8e2..0000000000
--- a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-/** Check the dequantization arguments.
- *
- * The input must be a non-null, single-channel quantized tensor of one of the listed types.
- * The output is only checked when its shape is non-empty: it must then be F16 or F32 and match the input shape.
- */
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16);
-
-    // Nothing more to verify while the output has not been given a shape yet
-    const bool output_is_configured = output->tensor_shape().total_size() > 0;
-    if(!output_is_configured)
-    {
-        return Status{};
-    }
-
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-
-    return Status{};
-}
-
-/** Compute the execution window and make sure the output info is initialised.
- *
- * @param[in]     input  Source tensor info; the window is the maximum window over it.
- * @param[in,out] output Destination tensor info; if empty it is initialised to the input shape, one channel, F32.
- *
- * @return A tuple holding an empty Status and the computed window.
- */
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-    Window max_window = calculate_max_window(*input, Steps());
-
-    auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32);
-
-    // No padding is required by this kernel, so update_window_and_padding() is skipped and
-    // the valid region is set directly to the full output shape.
-    Coordinates anchor;
-    anchor.set_num_dimensions(output->num_dimensions());
-    const ValidRegion whole_output(anchor, output->tensor_shape());
-    output->set_valid_region(whole_output);
-
-    return std::make_tuple(Status{}, max_window);
-}
-} // namespace
-
-/** Default constructor: the input and output tensor pointers start as null until configure() assigns them. */
-CLDequantizationLayerKernel::CLDequantizationLayerKernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-/** Convenience overload: configure the kernel using the compile context of the global CLKernelLibrary. */
-void CLDequantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-    const CLCompileContext &default_context = CLKernelLibrary::get().get_compile_context();
-    configure(default_context, input, output);
-}
-
-/** Configure the dequantization kernel for the given tensors.
- *
- * @param[in]  compile_context Compile context used to build the OpenCL kernel.
- * @param[in]  input           Quantized source tensor (types as enforced by validate_arguments()).
- * @param[out] output          Destination tensor (F16/F32). If its info is still empty it is
- *                             initialised to the input shape with data type F32.
- */
-void CLDequantizationLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output tensor auto initialization if not yet initialized.
-    // validate() accepts an empty output (it auto-initialises a clone), so configure() must do the
-    // same before the output element size and shape are read below; otherwise the vector size
-    // would be derived from an output whose data type has not been set.
-    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, DataType::F32);
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
-    _input  = input;
-    _output = output;
-
-    // Each work-item handles 16 bytes worth of output elements along X
-    const int  vec_size_x     = 16 / output->info()->element_size();
-    const int  output_width_x = output->info()->tensor_shape().x();
-    const bool multi_access_x = (output_width_x / vec_size_x > 0);
-
-    // Create and update the window (if needed)
-    Window win = calculate_max_window(*output->info());
-    if(multi_access_x)
-    {
-        // Step X by the vector size and round the end up to a whole number of vectors
-        win.set(Window::DimX,
-                Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
-    }
-    ICLKernel::configure_internal(win);
-
-    const bool  is_quantized_per_channel = is_data_type_quantized_per_channel(input->info()->data_type());
-    std::string kernel_name              = "dequantization_layer";
-
-    // Create kernel
-    CLBuildOptions build_opts;
-    if(!is_quantized_per_channel)
-    {
-        // Uniform quantization: scale and offset are compile-time constants; symmetric types use offset 0
-        const UniformQuantizationInfo qinfo   = input->info()->quantization_info().uniform();
-        const int                     qoffset = is_data_type_quantized_asymmetric(input->info()->data_type()) ? qinfo.offset : 0;
-        build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
-        build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qoffset));
-    }
-    else
-    {
-        // Per-channel quantization selects a layout-specific kernel variant
-        kernel_name += "_per_channel";
-        kernel_name += input->info()->data_layout() == DataLayout::NCHW ? "_nchw" : "_nhwc";
-    }
-
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
-    build_opts.add_option("-DDATA_TYPE_SRC=" + get_cl_type_from_data_type(input->info()->data_type()));
-    build_opts.add_option("-DDATA_TYPE_DST=" + get_cl_type_from_data_type(output->info()->data_type()));
-    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
-
-    // Build the kernel selected above
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-}
-
-/** Static check that the kernel could be configured with the given tensor infos.
- *
- * @return An empty Status on success, otherwise the first error found.
- */
-Status CLDequantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
-
-    // The window set-up may initialise the output info, so it operates on clones
-    const auto input_clone  = input->clone();
-    const auto output_clone = output->clone();
-    const auto window_setup = validate_and_configure_window(input_clone.get(), output_clone.get());
-    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(window_setup));
-
-    return Status{};
-}
-
-/** Enqueue the dequantization kernel over every 3D slice of the (possibly collapsed) window.
- *
- * @param[in]     window Execution window; must be a valid sub-window of the configured one.
- * @param[in,out] queue  Command queue the kernel is enqueued on.
- */
-void CLDequantizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    const bool per_channel = is_data_type_quantized_per_channel(_input->info()->data_type());
-
-    // Collapse the window where possible; per-channel inputs pass 4 as the first dimension, others pass 3
-    const size_t first_collapsed_dim = per_channel ? 4 : 3;
-    Window       collapsed           = window.collapse_if_possible(ICLKernel::window(), first_collapsed_dim);
-    Window       slice               = collapsed.first_slice_window_3D();
-
-    if(per_channel)
-    {
-        // The scale buffer is bound right after the input and output 3D tensor arguments
-        const unsigned int scale_arg_index = num_arguments_per_3D_tensor() * 2;
-        _kernel.setArg(scale_arg_index, _input->quantization().scale->cl_buffer());
-    }
-
-    do
-    {
-        unsigned int arg_index = 0;
-        add_3D_tensor_argument(arg_index, _input, slice);
-        add_3D_tensor_argument(arg_index, _output, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDerivativeKernel.cpp b/src/core/CL/kernels/CLDerivativeKernel.cpp
deleted file mode 100644
index 595ff9bdee..0000000000
--- a/src/core/CL/kernels/CLDerivativeKernel.cpp
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-/** Default constructor: all tensor pointers start as null and neither derivative direction is enabled until configure() runs. */
-CLDerivativeKernel::CLDerivativeKernel()
- : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_derivative_x(false), _run_derivative_y(false)
-{
-}
-
-/** Border required by the kernel: a BorderSize constructed from the single value 1. */
-BorderSize CLDerivativeKernel::border_size() const
-{
- return BorderSize(1);
-}
-
-/** Convenience overload: configure the kernel using the compile context of the global CLKernelLibrary. */
-void CLDerivativeKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
-    const CLCompileContext &default_context = CLKernelLibrary::get().get_compile_context();
-    configure(default_context, input, output_x, output_y, border_undefined);
-}
-
-/** Configure the derivative kernel.
- *
- * @param[in]  compile_context  Compile context used to create the OpenCL kernel.
- * @param[in]  input            Source tensor; must be single-channel U8.
- * @param[out] output_x         Optional X-gradient tensor (single-channel S16); may be nullptr.
- * @param[out] output_y         Optional Y-gradient tensor (single-channel S16); may be nullptr.
- * @param[in]  border_undefined Forwarded to the window and valid-region computation.
- *
- * At least one of @p output_x and @p output_y must be non-null.
- */
-void CLDerivativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
- // A gradient direction is computed only when its output tensor was supplied
- _run_derivative_x = output_x != nullptr;
- _run_derivative_y = output_y != nullptr;
-
- if(_run_derivative_x)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
- }
-
- if(_run_derivative_y)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
- }
-
- _input = input;
- _output_x = output_x;
- _output_y = output_y;
-
- // Set build options
- std::set<std::string> build_opts;
-
- if(_run_derivative_x)
- {
- build_opts.insert("-DGRAD_X");
- }
-
- if(_run_derivative_y)
- {
- build_opts.insert("-DGRAD_Y");
- }
-
- // Create kernel
- const std::string kernel_name = std::string("derivative");
- _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 16;
- constexpr unsigned int num_read_rows_per_iteration = 3;
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
- // The input access starts out empty and is replaced below according to the enabled directions;
- // an output access built from a null info corresponds to a disabled direction.
- AccessWindowRectangle input_access(input->info(), 0, 0, 0, 0);
- AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_processed_per_iteration);
- if(_run_derivative_x && _run_derivative_y)
- {
- // Both directions: read starts one border to the left and above, and spans 3 rows
- // TODO(COMPMID-415) Fix x-access input bug in CL kernel instead of '+2'
- input_access = AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_processed_per_iteration + 2, num_read_rows_per_iteration);
- }
- else if(_run_derivative_x)
- {
- // X only: horizontal access starting one border to the left
- // TODO(COMPMID-415) Fix x-access input bug in CL kernel instead of '+2'
- input_access = AccessWindowHorizontal(input->info(), -border_size().left, num_elems_processed_per_iteration + 2);
- }
- else if(_run_derivative_y)
- {
- // Y only: access starts one border above and spans 3 rows
- input_access = AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_processed_per_iteration, num_read_rows_per_iteration);
- }
-
- update_window_and_padding(win,
- input_access,
- output_x_access,
- output_y_access);
-
- output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
- output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- // Format: <kernel_name>_<input data type>_<input dim 0>_<input dim 1>_<border_undefined>
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(border_undefined);
-}
-
-/** Enqueue the derivative kernel over every 2D slice of @p window.
- *
- * @param[in]     window Execution window; must be a valid sub-window of the configured one.
- * @param[in,out] queue  Command queue the kernel is enqueued on.
- */
-void CLDerivativeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window plane = window.first_slice_window_2D();
-    do
-    {
-        // The input is always bound; each output only when its direction is enabled
-        unsigned int arg_index = 0;
-        add_2D_tensor_argument(arg_index, _input, plane);
-        add_2D_tensor_argument_if(_run_derivative_x, arg_index, _output_x, plane);
-        add_2D_tensor_argument_if(_run_derivative_y, arg_index, _output_y, plane);
-
-        enqueue(queue, *this, plane, lws_hint());
-    }
-    while(window.slide_window_slice_2D(plane));
-}
diff --git a/src/core/CL/kernels/CLDilateKernel.cpp b/src/core/CL/kernels/CLDilateKernel.cpp
deleted file mode 100644
index 7cba97f839..0000000000
--- a/src/core/CL/kernels/CLDilateKernel.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLDilateKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-
-using namespace arm_compute;
-
-/** Border required by the kernel: a BorderSize constructed from the single value 1. */
-BorderSize CLDilateKernel::border_size() const
-{
- return BorderSize(1);
-}
-
-/** Convenience overload: configure the kernel using the compile context of the global CLKernelLibrary. */
-void CLDilateKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    const CLCompileContext &default_context = CLKernelLibrary::get().get_compile_context();
-    configure(default_context, input, output, border_undefined);
-}
-
-/** Configure the dilate kernel.
- *
- * @param[in]  compile_context  Compile context used to create the "dilate" OpenCL kernel.
- * @param[in]  input            Source tensor; must be single-channel U8.
- * @param[out] output           Destination tensor; must be single-channel U8.
- * @param[in]  border_undefined Forwarded to the window and valid-region computation.
- */
-void CLDilateKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
-    _kernel = create_kernel(compile_context, "dilate");
-    _input  = input;
-    _output = output;
-
-    // Per iteration: 8 output elements are written; 16 elements over 3 rows of input are read
-    constexpr unsigned int elems_written_per_step = 8;
-    constexpr unsigned int elems_read_per_step    = 16;
-    constexpr unsigned int rows_read_per_step     = 3;
-
-    const BorderSize border = border_size();
-
-    Window win = calculate_max_window(*input->info(), Steps(elems_written_per_step), border_undefined, border);
-
-    // The input access starts one border to the left of and above the current position
-    AccessWindowRectangle  input_access(input->info(), -border.left, -border.top, elems_read_per_step, rows_read_per_step);
-    AccessWindowHorizontal output_access(output->info(), 0, elems_written_per_step);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border);
-
-    ICLKernel::configure_internal(win);
-}
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
deleted file mode 100644
index 2fcc82fbd3..0000000000
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ /dev/null
@@ -1,638 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-/** Check the direct convolution arguments.
- *
- * @param[in] input     Source tensor info (QASYMM8_SIGNED, QASYMM8, F16 or F32).
- * @param[in] weights   Weights tensor info; same data type as @p input, square 1x1/3x3/5x5/9x9 kernel, at most 4D.
- * @param[in] biases    Optional biases info (may be nullptr); S32 for asymmetric quantized inputs, otherwise the weights' type.
- * @param[in] output    Destination tensor info; shape and type are only checked when it is already configured.
- * @param[in] conv_info Padding and stride information.
- *
- * @return An empty Status on success, otherwise the first error found.
- */
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-
-    const DataLayout data_layout = input->data_layout();
-    const int        width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int        height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const int        channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), "Weights should have same width and height");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9,
-                                    "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != input->dimension(channel_idx),
-                                    "Weights feature map dimension should match the respective input's one");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported for 1x1 convolution.");
-    // 9x9 is included here: the per-iteration set-up only handles strides 1 and 2 for 9x9 kernels,
-    // so larger strides are reported as a validation error rather than failing later.
-    // NOTE(review): the NHWC per-iteration set-up also lacks a stride-3 case for 1x1 kernels — confirm whether that needs a check here.
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 || weights->dimension(width_idx) == 9) && std::get<0>(conv_info.stride()) > 2,
-                                    "Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution.");
-
-    const auto data_type = input->data_type();
-
-    if(weights->dimension(width_idx) == 9)
-    {
-        // 9x9 kernels: quantized types require NCHW, all other types require NHWC
-        const auto supported_data_layout = is_data_type_quantized(data_type) ? DataLayout::NCHW : DataLayout::NHWC;
-        const auto error_message         = std::string("Only " + string_from_data_layout(supported_data_layout) + " layout is supported for 9x9 convolution with " + string_from_data_type(
-                                                           data_type)
-                                                       + " type");
-
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG((supported_data_layout != data_layout), error_message.c_str());
-    }
-
-    if(biases != nullptr)
-    {
-        if(is_data_type_quantized_asymmetric(input->data_type()))
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
-        }
-        // weights->dimension(3) is the number of kernels, i.e. the number of output feature maps
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3),
-                                        "Biases size and number of output feature maps should match");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1,
-                                        "Biases should be one dimensional");
-    }
-
-    // Checks performed when output is configured
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
-                                                           misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    }
-
-    if(is_data_type_quantized(data_type))
-    {
-        // The requantization multiplier must be representable as a fixed-point multiplier and shift
-        const UniformQuantizationInfo iqinfo = input->quantization_info().uniform();
-        const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
-        const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
-
-        float multiplier        = iqinfo.scale * wqinfo.scale / oqinfo.scale;
-        int   output_multiplier = 0;
-        int   output_shift      = 0;
-        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
-    }
-    return Status{};
-}
-
-/** Tell whether the Bifrost-optimised NCHW kernel can be used.
- *
- * @return true only for the listed GPU targets, kernel size <= 5, unit strides in X and Y, F32 data and NCHW layout.
- */
-inline bool can_run_optimized_kernel_for_bifrost(GPUTarget gpu_target, unsigned int conv_stride_x, unsigned int conv_stride_y, unsigned int kernel_size,
-                                                 DataType data_type, DataLayout data_layout)
-{
-    const bool is_supported_target = gpu_target_is_in(gpu_target,
-                                                      GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
-                                                      GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
-                                                      GPUTarget::G52, GPUTarget::G52LIT);
-    const bool has_unit_strides    = (conv_stride_x == 1) && (conv_stride_y == 1);
-    const bool is_f32_nchw         = (data_type == DataType::F32) && (data_layout == DataLayout::NCHW);
-
-    return is_supported_target && (kernel_size <= 5) && has_unit_strides && is_f32_nchw;
-}
-
-/** Tell whether the Bifrost-optimised NHWC kernel can be used.
- *
- * @return true only for the listed GPU targets, kernel size 9, unit strides in X and Y, F32 data and NHWC layout.
- */
-inline bool can_run_optimized_kernel_for_bifrost_nhwc(GPUTarget gpu_target, unsigned int conv_stride_x, unsigned int conv_stride_y, unsigned int kernel_size,
-                                                      DataType data_type, DataLayout data_layout)
-{
-    const bool is_supported_target = gpu_target_is_in(gpu_target,
-                                                      GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
-                                                      GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
-                                                      GPUTarget::G52, GPUTarget::G52LIT);
-    const bool has_unit_strides    = (conv_stride_x == 1) && (conv_stride_y == 1);
-    const bool is_f32_nhwc         = (data_type == DataType::F32) && (data_layout == DataLayout::NHWC);
-
-    return is_supported_target && (kernel_size == 9) && has_unit_strides && is_f32_nhwc;
-}
-
-/** Select how many elements one kernel iteration reads and writes along X and Y.
- *
- * The choice depends on the kernel size, the X stride, the input data layout and, for the
- * optimised paths, the GPU target and data type. Unsupported combinations raise ARM_COMPUTE_ERROR.
- *
- * @param[out] num_elems_read_per_iteration_x    Elements read along X per iteration.
- * @param[out] num_elems_read_per_iteration_y    Elements read along Y per iteration.
- * @param[out] num_elems_written_per_iteration_x Elements written along X per iteration.
- * @param[out] num_elems_written_per_iteration_y Elements written along Y per iteration.
- * @param[in]  kernel_size                       Side of the square convolution kernel.
- * @param[in]  conv_info                         Padding and stride information (only the strides are read here).
- * @param[in]  target                            GPU target used to pick the optimised paths.
- * @param[in]  input                             Input tensor info (data type, layout and element size are read).
- */
-inline void setup_num_elems(unsigned int &num_elems_read_per_iteration_x, unsigned int &num_elems_read_per_iteration_y,
- unsigned int &num_elems_written_per_iteration_x, unsigned int &num_elems_written_per_iteration_y,
- unsigned int kernel_size, const PadStrideInfo &conv_info, const GPUTarget target, ITensorInfo *input)
-{
- const DataType data_type = input->data_type();
- const DataLayout data_layout = input->data_layout();
- unsigned int conv_stride_x = std::get<0>(conv_info.stride());
- unsigned int conv_stride_y = std::get<1>(conv_info.stride());
-
- const bool run_optimized_bifrost = can_run_optimized_kernel_for_bifrost(target, conv_stride_x, conv_stride_y, kernel_size, data_type, data_layout);
-
- if(run_optimized_bifrost)
- {
- // Optimised NCHW path: fixed read/write tile per kernel size
- // Configure kernel window
- switch(kernel_size)
- {
- case 1:
- {
- num_elems_read_per_iteration_x = 4;
- num_elems_read_per_iteration_y = 4;
- num_elems_written_per_iteration_x = 4;
- num_elems_written_per_iteration_y = 4;
- break;
- }
- case 3:
- {
- num_elems_read_per_iteration_x = 6;
- num_elems_read_per_iteration_y = 5;
- num_elems_written_per_iteration_x = 4;
- num_elems_written_per_iteration_y = 3;
- break;
- }
- case 5:
- {
- num_elems_read_per_iteration_x = 8;
- num_elems_read_per_iteration_y = 6;
- num_elems_written_per_iteration_x = 4;
- num_elems_written_per_iteration_y = 2;
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost");
- }
- }
- }
- else if(data_layout == DataLayout::NCHW)
- {
- // Generic NCHW path: 8x1 outputs per iteration, kernel_size rows read;
- // the number of elements read along X depends on kernel size and X stride
- num_elems_read_per_iteration_y = kernel_size;
- num_elems_written_per_iteration_x = 8;
- num_elems_written_per_iteration_y = 1;
- switch(kernel_size)
- {
- case 1:
- switch(conv_stride_x)
- {
- case 1:
- num_elems_read_per_iteration_x = 8;
- break;
- case 2:
- num_elems_read_per_iteration_x = 16;
- break;
- case 3:
- // For stride 3 the read width additionally depends on the element size in bytes
- switch(input->element_size())
- {
- case 1:
- num_elems_read_per_iteration_x = 28;
- break;
- case 2:
- num_elems_read_per_iteration_x = 24;
- break;
- case 4:
- num_elems_read_per_iteration_x = 22;
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid data size");
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid convolution stride X");
- }
- break;
- case 3:
- switch(conv_stride_x)
- {
- case 1:
- num_elems_read_per_iteration_x = 10;
- break;
- case 2:
- num_elems_read_per_iteration_x = 17;
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid convolution stride X");
- }
- break;
- case 5:
- switch(conv_stride_x)
- {
- case 1:
- num_elems_read_per_iteration_x = 12;
- break;
- case 2:
- num_elems_read_per_iteration_x = 20;
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid convolution stride X");
- }
- break;
- case 9:
- switch(conv_stride_x)
- {
- case 1:
- num_elems_read_per_iteration_x = 16;
- break;
- case 2:
- num_elems_read_per_iteration_x = 24;
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid convolution stride X");
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid direct convolution size");
- }
- }
- else // data_layout == NHWC
- {
- const bool run_optimized_bifrost_nhwc = can_run_optimized_kernel_for_bifrost_nhwc(target, conv_stride_x, conv_stride_y, kernel_size, data_type, data_layout);
-
- // NHWC path: one element written along X; 4 elements read along X on the optimised path, otherwise 1
- num_elems_written_per_iteration_x = 1;
-
- if(run_optimized_bifrost_nhwc)
- {
- num_elems_read_per_iteration_x = 4;
- }
- else
- {
- num_elems_read_per_iteration_x = 1;
- }
-
- // The Y counts are selected from the kernel size and the X stride; only strides 1 and 2 are handled
- switch(kernel_size)
- {
- case 1:
- switch(conv_stride_x)
- {
- case 1:
- num_elems_read_per_iteration_y = 8;
- num_elems_written_per_iteration_y = 8;
- break;
- case 2:
- num_elems_read_per_iteration_y = 16;
- num_elems_written_per_iteration_y = 8;
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid convolution stride X");
- }
- break;
- case 3:
- switch(conv_stride_x)
- {
- case 1:
- num_elems_read_per_iteration_y = 10;
- num_elems_written_per_iteration_y = 8;
- break;
- case 2:
- num_elems_read_per_iteration_y = 17;
- num_elems_written_per_iteration_y = 8;
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid convolution stride X");
- }
- break;
- case 5:
- switch(conv_stride_x)
- {
- case 1:
- num_elems_read_per_iteration_y = 12;
- num_elems_written_per_iteration_y = 8;
- break;
- case 2:
- num_elems_read_per_iteration_y = 20;
- num_elems_written_per_iteration_y = 8;
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid convolution stride X");
- }
- break;
- case 9:
- switch(conv_stride_x)
- {
- case 1:
- num_elems_read_per_iteration_y = 16;
- num_elems_written_per_iteration_y = 8;
- break;
- case 2:
- num_elems_read_per_iteration_y = 24;
- num_elems_written_per_iteration_y = 8;
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid convolution stride X");
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Not implemented.");
- break;
- }
- }
-}
-
-/** Initialise the output info if needed, compute the execution window and check the padding.
- *
- * @param[in,out] input     Source tensor info.
- * @param[in,out] weights   Weights tensor info.
- * @param[in,out] output    Destination tensor info; auto-initialised from the convolved shape when empty.
- * @param[in]     conv_info Padding and stride information.
- * @param[in]     target    GPU target, forwarded to setup_num_elems().
- *
- * @return The window paired with an empty Status, or with an "Insufficient Padding!" runtime error
- *         when update_window_and_padding() reports a change.
- */
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, const GPUTarget target)
-{
- const DataLayout data_layout = input->data_layout();
- const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int kernel_size = weights->dimension(width_idx);
-
- // Get convolved dimensions
- TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info);
-
- // Output auto initialization if not yet initialized
- // TODO(COMPMID-2078): input->clone()->set_tensor_shape(output_shape) doesn't work with subtensors for grouped direct convolutions (AlexNet).
- auto_init_if_empty(*output, output_shape,
- 1,
- input->data_type(),
- input->quantization_info());
-
- unsigned int num_elems_read_per_iteration_x = 0;
- unsigned int num_elems_read_per_iteration_y = 0;
- unsigned int num_elems_written_per_iteration_x = 0;
- unsigned int num_elems_written_per_iteration_y = 0;
-
- // NOTE(review): the pads are unsigned and are negated below when passed as access-window
- // start offsets; this presumably relies on conversion to a signed parameter — confirm.
- unsigned int conv_pad_left = conv_info.pad_left();
- unsigned int conv_pad_top = conv_info.pad_top();
- unsigned int conv_stride_x = std::get<0>(conv_info.stride());
- unsigned int conv_stride_y = std::get<1>(conv_info.stride());
-
- setup_num_elems(num_elems_read_per_iteration_x, num_elems_read_per_iteration_y,
- num_elems_written_per_iteration_x, num_elems_written_per_iteration_y,
- kernel_size, conv_info, target, input);
-
- // Create window and update padding
- bool window_changed = false;
- Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
-
- if(data_layout == DataLayout::NHWC)
- {
- // NHWC: static input access starting at (0, -pad_left) with both ends rounded up to the read sizes;
- // the weights are accessed over their first two dimensions in full
- AccessWindowStatic input_access(input, 0, -conv_pad_left,
- ceil_to_multiple(input->dimension(0), num_elems_read_per_iteration_x),
- ceil_to_multiple(input->dimension(1) + conv_info.pad_right(), num_elems_read_per_iteration_y));
- AccessWindowStatic weights_access(weights, 0, 0, weights->dimension(0), weights->dimension(1));
- AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
- window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
- }
- else if(data_layout == DataLayout::NCHW)
- {
- // NCHW: rectangular input access offset by the left/top padding and scaled by the strides;
- // the weights are accessed as a kernel_size x kernel_size block
- AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, conv_stride_x, conv_stride_y);
- AccessWindowStatic weights_access(weights, 0, 0, kernel_size, kernel_size);
- AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
- window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
- }
- else
- {
- // Any other layout is rejected; this branch has no return statement of its own
- ARM_COMPUTE_ERROR("Not supported");
- }
-}
-} // namespace
-
-/** Default constructor: tensor pointers are null, the data layout is UNKNOWN, and the border size and strides are zero until configure() runs. */
-CLDirectConvolutionLayerKernel::CLDirectConvolutionLayerKernel()
- : _input(nullptr), _biases(nullptr), _weights(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _border_size(0), _conv_stride_x(0), _conv_stride_y(0)
-{
-}
-
-/** Return the border size stored in the _border_size member. */
-BorderSize CLDirectConvolutionLayerKernel::border_size() const
-{
- return _border_size;
-}
-
-/** Convenience overload: configure the kernel using the compile context of the global CLKernelLibrary. */
-void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
-{
-    const CLCompileContext &default_context = CLKernelLibrary::get().get_compile_context();
-    configure(default_context, input, weights, biases, output, conv_info);
-}
-
-void CLDirectConvolutionLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &conv_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-
- _data_layout = input->info()->data_layout();
- const int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- const int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
-
- const unsigned int kernel_size = weights->info()->dimension(width_idx);
- const DataType data_type = input->info()->data_type();
-
- // Get convolved dimensions
- TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input->info(), *weights->info(), conv_info);
-
- // Output auto inizialitation if not yet initialized
- // TODO(COMPMID-2078): input->clone()->set_tensor_shape(output_shape) doesn't work with subtensors for grouped direct convolutions (AlexNet).
- auto_init_if_empty(*output->info(),
- output_shape,
- 1,
- input->info()->data_type(),
- input->info()->quantization_info());
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
- weights->info(),
- (biases != nullptr) ? biases->info() : nullptr,
- output->info(),
- conv_info));
-
- _conv_stride_x = std::get<0>(conv_info.stride());
- _conv_stride_y = std::get<1>(conv_info.stride());
-
- if(_data_layout == DataLayout::NHWC)
- {
- _border_size = BorderSize(conv_info.pad_left(), 0, conv_info.pad_right(), 0);
- }
- else if(_data_layout == DataLayout::NCHW)
- {
- _border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
- }
- else
- {
- ARM_COMPUTE_ERROR("Not supported");
- }
-
- _input = input;
- _weights = weights;
- _output = output;
- _biases = biases;
-
- const GPUTarget gpu_target = get_target();
-
- std::stringstream kernel_name;
- kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;
- if(_data_layout == DataLayout::NHWC)
- {
- kernel_name << "_" << lower_string(string_from_data_layout(_data_layout));
- }
-
- CLBuildOptions build_options;
- build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS"));
-
- const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, _data_layout);
-
- if(run_optimized_for_bifrost)
- {
- build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(channel_idx))));
-
- kernel_name << "_f32_bifrost";
- _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options());
- }
- else
- {
- build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
- build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
- build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(channel_idx))));
- build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x)));
- if(_data_layout == DataLayout::NHWC)
- {
- const bool run_optimized_for_bifrost_nhwc = can_run_optimized_kernel_for_bifrost_nhwc(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, _data_layout);
- build_options.add_option(std::string("-DDATA_LAYOUT_NHWC=1"));
- build_options.add_option(std::string("-DDST_HEIGHT=" + support::cpp11::to_string(_output->info()->dimension(height_idx))));
- build_options.add_option(std::string("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(width_idx))));
- build_options.add_option(std::string("-DSRC_HEIGHT=" + support::cpp11::to_string(_input->info()->dimension(height_idx))));
- build_options.add_option(std::string("-DSRC_WIDTH=" + support::cpp11::to_string(_input->info()->dimension(width_idx))));
- build_options.add_option(std::string("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left())));
- build_options.add_option(std::string("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top())));
- build_options.add_option(std::string("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom())));
- build_options.add_option(std::string("-DSTRIDE_Y=" + support::cpp11::to_string(_conv_stride_y)));
- if(run_optimized_for_bifrost_nhwc)
- {
- const unsigned int num_elems_read_per_iteration_x = 4;
- _border_size.right = num_elems_read_per_iteration_x;
- build_options.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_read_per_iteration_x));
- }
- }
- build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type)));
-
- if(is_data_type_quantized(data_type))
- {
- const UniformQuantizationInfo iqinfo = _input->info()->quantization_info().uniform();
- const UniformQuantizationInfo wqinfo = _weights->info()->quantization_info().uniform();
- const UniformQuantizationInfo oqinfo = _output->info()->quantization_info().uniform();
-
- float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
- int output_multiplier = 0;
- int output_shift = 0;
- quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
- build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
- build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
- build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size));
-
- // Create kernel
- _kernel = create_kernel(compile_context, "direct_convolution_quantized", build_options.options());
-
- // Set static kernel arguments
- unsigned int idx = 3 * num_arguments_per_3D_tensor() + ((_biases != nullptr) ? num_arguments_per_1D_tensor() : 0) + 1;
- _kernel.setArg(idx++, -iqinfo.offset);
- _kernel.setArg(idx++, -wqinfo.offset);
- _kernel.setArg(idx++, oqinfo.offset);
- }
- else
- {
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options());
- }
- }
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, gpu_target);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Set config_id for enabling LWS tuning
- _config_id = "direct_convolution_";
- _config_id += lower_string(string_from_data_type(data_type));
- _config_id += "_";
- _config_id += support::cpp11::to_string(kernel_size);
- _config_id += "_";
- _config_id += support::cpp11::to_string(border_size().left);
- _config_id += "_";
- _config_id += support::cpp11::to_string(border_size().top);
- _config_id += "_";
- _config_id += support::cpp11::to_string(border_size().right);
- _config_id += "_";
- _config_id += support::cpp11::to_string(border_size().bottom);
- _config_id += "_";
- _config_id += support::cpp11::to_string(_conv_stride_x);
- _config_id += "_";
- _config_id += support::cpp11::to_string(_conv_stride_y);
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(width_idx));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(height_idx));
- _config_id += "_";
- _config_id += lower_string(string_from_data_layout(_data_layout));
-}
-
-Status CLDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const GPUTarget target)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, target).first);
-
- return Status{};
-}
-
-void CLDirectConvolutionLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- // Get initial windows
- Window slice = window.first_slice_window_3D();
- Window win_in = window;
-
- win_in.adjust(Window::DimX, -_border_size.left, true);
- win_in.adjust(Window::DimY, -_border_size.top, true);
-
- const int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-
- win_in.set_dimension_step(width_idx, window[width_idx].step() * _conv_stride_x);
- win_in.set_dimension_step(height_idx, window[height_idx].step() * _conv_stride_y);
-
- Window slice_in = win_in.first_slice_window_3D();
- unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
- add_3D_tensor_argument(idx1, _weights, slice);
-
- if(_biases != nullptr)
- {
- Window slice_biases;
- slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
- add_1D_tensor_argument(idx1, _biases, slice_biases);
- }
-
- _kernel.setArg(idx1++, static_cast<unsigned int>(_weights->info()->strides_in_bytes()[3]));
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp b/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp
deleted file mode 100644
index 5c74579184..0000000000
--- a/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo &input, const ITensorInfo &output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32);
-
- // Validate in case of configured output
- if(output.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input, &output);
- }
-
- return Status{};
-}
-} // namespace
-
-void CLElementWiseUnaryLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ElementWiseUnary &op)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, op);
-}
-
-void CLElementWiseUnaryLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ElementWiseUnary &op)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *output->info()));
-
- // Configure kernel window
- _input = input;
- _output = output;
-
- const std::string kernel_name = "elementwise_unary";
- const int vec_size_x = 16 / output->info()->element_size();
- const int output_width_x = output->info()->tensor_shape().x();
- const bool multi_access_x = (output_width_x / vec_size_x > 0);
-
- Window win = calculate_max_window(*output->info());
- if(multi_access_x)
- {
- win.set(Window::DimX,
- Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
- }
- ICLKernel::configure_internal(win);
-
- // Set kernel build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
- build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
- switch(op)
- {
- case ElementWiseUnary::RSQRT:
- build_opts.add_option("-DOPERATION=rsqrt_op");
- break;
- case ElementWiseUnary::EXP:
- build_opts.add_option("-DOPERATION=exp_op");
- break;
- case ElementWiseUnary::NEG:
- build_opts.add_option("-DOPERATION=neg_op");
- break;
- case ElementWiseUnary::SIN:
- build_opts.add_option("-DOPERATION=sin_op");
- break;
- case ElementWiseUnary::ABS:
- build_opts.add_option("-DOPERATION=fabs_op");
- break;
- case ElementWiseUnary::LOG:
- build_opts.add_option("-DOPERATION=natural_log_op");
- break;
- case ElementWiseUnary::ROUND:
- build_opts.add_option("-DOPERATION=round_op");
- break;
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-}
-
-Status CLElementWiseUnaryLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ElementWiseUnary &op)
-{
- ARM_COMPUTE_UNUSED(op);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output));
-
- return Status{};
-}
-
-void CLElementWiseUnaryLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
diff --git a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
deleted file mode 100644
index 00a97d50e9..0000000000
--- a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
+++ /dev/null
@@ -1,463 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "support/StringSupport.h"
-#include <map>
-
-namespace arm_compute
-{
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-std::map<ArithmeticOperation, std::string> supported_arithmetic_ops =
-{
- { ArithmeticOperation::ADD, "ADD" },
- { ArithmeticOperation::SUB, "SUB" },
- { ArithmeticOperation::DIV, "DIV" },
- { ArithmeticOperation::SQUARED_DIFF, "SQUARED_DIFF" },
- { ArithmeticOperation::MIN, "MIN" },
- { ArithmeticOperation::MAX, "MAX" },
- { ArithmeticOperation::POWER, "POWER" },
- { ArithmeticOperation::PRELU, "PRELU" },
-};
-
-std::map<ArithmeticOperation, std::string> supported_sat_arithmetic_ops =
-{
- { ArithmeticOperation::ADD, "ADD" },
- { ArithmeticOperation::SUB, "SUB" },
-};
-
-std::string generate_id_for_tuning_common(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output)
-{
- std::string config_id;
- // Set config_id for enabling LWS tuning
- config_id = kernel_name;
- config_id += "_";
- config_id += lower_string(string_from_data_type(input1.data_type()));
- config_id += "_";
- config_id += support::cpp11::to_string(output.dimension(0));
- config_id += "_";
- config_id += support::cpp11::to_string(output.dimension(1));
- return config_id;
-}
-
-Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(&input1, &input2, &output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2);
-
- const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
-
- // Validate in case of configured output
- if(output.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
- "Wrong shape for output");
- }
-
- return Status{};
-}
-
-Status validate_arguments_with_arithmetic_rules(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input2);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, DataType::F16, DataType::F32);
-
- const bool is_quantized = is_data_type_quantized(input1.data_type()) || is_data_type_quantized(input2.data_type());
- if(is_quantized)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2);
-
- if(is_data_type_quantized_symmetric(input1.data_type()))
- {
- const int32_t in1_offset = input1.quantization_info().uniform().offset;
- const int32_t in2_offset = input2.quantization_info().uniform().offset;
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_offset != 0, "For quantized symmetric, offset must be zero");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(in2_offset != 0, "For quantized symmetric, offset must be zero");
- }
- }
-
- const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
-
- // Validate in case of configured output
- if(output.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((output.data_type() == DataType::U8) && ((input1.data_type() != DataType::U8) || (input2.data_type() != DataType::U8)),
- "Output can only be U8 if both inputs are U8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
- "Wrong shape for output");
-
- if(is_quantized)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &output);
-
- if(is_data_type_quantized_symmetric(output.data_type()))
- {
- const int32_t offset = output.quantization_info().uniform().offset;
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(offset != 0, "For quantized symmetric, offset must be zero");
- }
- }
- }
- return Status{};
-}
-
-CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, const std::string &operation_string)
-{
- CLBuildOptions build_opts;
-
- build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1.data_type()));
- build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2.data_type()));
- build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output.data_type()));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DOP=" + operation_string);
- if(is_data_type_quantized(input1.data_type()))
- {
- const UniformQuantizationInfo iq1info = input1.quantization_info().uniform();
- const UniformQuantizationInfo iq2info = input2.quantization_info().uniform();
- const UniformQuantizationInfo oqinfo = output.quantization_info().uniform();
-
- build_opts.add_option("-DOFFSET_IN1=" + support::cpp11::to_string(iq1info.offset));
- build_opts.add_option("-DOFFSET_IN2=" + support::cpp11::to_string(iq2info.offset));
- build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(oqinfo.offset));
- build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1info.scale));
- build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2info.scale));
- build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale));
- }
- return build_opts;
-}
-
-std::pair<Status, Window> configure_window_arithmetic_common(const ValidRegion &valid_region, ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
-{
- Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
- Window win_input1 = win.broadcast_if_dimension_le_one(input1);
- Window win_input2 = win.broadcast_if_dimension_le_one(input2);
-
- AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win_input1, input1_access)
- || update_window_and_padding(win_input2, input2_access)
- || update_window_and_padding(win, output_access);
-
- output_access.set_valid_region(win, valid_region);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
-std::pair<Status, Window> validate_and_configure_window_for_arithmetic_operators(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
-{
- const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
- const TensorShape &out_shape = broadcast_pair.first;
- const ValidRegion &valid_region = broadcast_pair.second;
-
- set_shape_if_empty(output, out_shape);
-
- if(input1.data_type() == DataType::S16 || input2.data_type() == DataType::S16)
- {
- set_format_if_unknown(output, Format::S16);
- }
- else if(input1.data_type() == DataType::F16 || input2.data_type() == DataType::F16)
- {
- set_format_if_unknown(output, Format::F16);
- }
- else if(input1.data_type() == DataType::F32 || input2.data_type() == DataType::F32)
- {
- set_format_if_unknown(output, Format::F32);
- }
- else if(input1.data_type() == DataType::QASYMM8 || input2.data_type() == DataType::QASYMM8)
- {
- set_data_type_if_unknown(output, DataType::QASYMM8);
- }
- else if(input1.data_type() == DataType::QASYMM8_SIGNED || input2.data_type() == DataType::QASYMM8_SIGNED)
- {
- set_data_type_if_unknown(output, DataType::QASYMM8_SIGNED);
- }
- else if(input1.data_type() == DataType::QSYMM16 || input2.data_type() == DataType::QSYMM16)
- {
- set_data_type_if_unknown(output, DataType::QSYMM16);
- }
-
- return configure_window_arithmetic_common(valid_region, input1, input2, output);
-}
-
-std::pair<Status, Window> validate_and_configure_window_for_division(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
-{
- const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
- const TensorShape &out_shape = broadcast_pair.first;
- const ValidRegion &valid_region = broadcast_pair.second;
- auto_init_if_empty(output, out_shape, 1, input1.data_type());
- return configure_window_arithmetic_common(valid_region, input1, input2, output);
-}
-} // namespace
-
-CLElementwiseOperationKernel::CLElementwiseOperationKernel()
- : _act_info(), _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLElementwiseOperationKernel::configure_common(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
- configure_common(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
-}
-
-void CLElementwiseOperationKernel::configure_common(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
- _input1 = input1;
- _input2 = input2;
- _output = output;
-
- std::string kernel_name = "elementwise_operation_" + name();
- if(is_data_type_quantized(input1->info()->data_type()))
- {
- kernel_name += "_quantized";
- }
-
- // Set kernel build options
- CLBuildOptions build_opts = generate_build_options(*input1->info(), *input2->info(), *output->info());
- if(_act_info.enabled())
- {
- build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(_act_info.activation())));
- build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(_act_info.a()));
- build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(_act_info.b()));
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- ICLKernel::configure_internal(win_config.second);
-
- _config_id = generate_id_for_tuning(kernel_name, *input1->info(), *output->info());
-}
-
-void CLElementwiseOperationKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const TensorShape &in_shape1 = _input1->info()->tensor_shape();
- const TensorShape &in_shape2 = _input2->info()->tensor_shape();
- const TensorShape &out_shape = _output->info()->tensor_shape();
-
- bool can_collapse = true;
- const bool is_vector = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1;
- if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
- {
- can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
- {
- can_collapse = (in_shape1[d] == in_shape2[d]);
- }
- }
-
- bool has_collapsed = false;
- Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
-
- const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
- const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
- Window slice = collapsed.first_slice_window_3D();
- Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
- Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
- do
- {
- unsigned int idx = 0;
-
- add_3D_tensor_argument(idx, _input1, slice_input1);
- add_3D_tensor_argument(idx, _input2, slice_input2);
- add_3D_tensor_argument(idx, _output, slice);
-
- enqueue(queue, *this, slice, lws_hint());
-
- ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
- ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLElementwiseOperationKernel::border_size() const
-{
- const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
- const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
- return BorderSize{ 0, border, 0, 0 };
-}
-
-/** Arithmetic operations with saturation*/
-
-void CLSaturatedArithmeticOperationKernel::configure(ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ConvertPolicy &policy,
- const ActivationLayerInfo &act_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), op, input1, input2, output, policy, act_info);
-}
-
-void CLSaturatedArithmeticOperationKernel::configure(const CLCompileContext &compile_context, ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output,
- const ConvertPolicy &policy,
- const ActivationLayerInfo &act_info)
-{
- _policy = policy;
- _op = op;
- _act_info = act_info;
- configure_common(compile_context, input1, input2, output);
-}
-
-Status CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy,
- const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_UNUSED(op, policy);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*input1, *input2, *output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone()).first);
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(output->data_type()));
-
- return Status{};
-}
-
-std::pair<Status, Window> CLSaturatedArithmeticOperationKernel::validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
-{
- return validate_and_configure_window_for_arithmetic_operators(input1, input2, output);
-}
-
-Status CLSaturatedArithmeticOperationKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
-{
- return validate_arguments_with_arithmetic_rules(input1, input2, output);
-}
-
-CLBuildOptions CLSaturatedArithmeticOperationKernel::generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
-{
- const bool has_float_out = is_data_type_float(output.data_type());
- auto build_options = generate_build_options_with_arithmetic_rules(input1, input2, output, name());
- build_options.add_option((_policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE");
- return build_options;
-}
-std::string CLSaturatedArithmeticOperationKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output)
-{
- auto config_id = generate_id_for_tuning_common(kernel_name, input1, output);
- config_id += (_policy == ConvertPolicy::WRAP) ? "_wrap_" : "_saturate_";
- config_id += lower_string(string_from_data_layout(input1.data_layout()));
- return config_id;
-}
-
-std::string CLSaturatedArithmeticOperationKernel::name()
-{
- return supported_sat_arithmetic_ops[_op];
-}
-
-/** Arithmetic operations*/
-
-void CLArithmeticOperationKernel::configure(ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), op, input1, input2, output, act_info);
-}
-
-void CLArithmeticOperationKernel::configure(const CLCompileContext &compile_context, ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output,
- const ActivationLayerInfo &act_info)
-{
- _op = op;
- _act_info = act_info;
- configure_common(compile_context, input1, input2, output);
-}
-
-Status CLArithmeticOperationKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
- if(op == ArithmeticOperation::DIV || op == ArithmeticOperation::POWER)
- {
- // Division and Power operators don't support integer arithmetic
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_float_only_supported_rules(*input1, *input2, *output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_division(*input1->clone(), *input2->clone(), *output->clone()).first);
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*input1, *input2, *output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone()).first);
- }
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(output->data_type()));
-
- return Status{};
-}
-std::pair<Status, Window> CLArithmeticOperationKernel::validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
-{
- if(_op == ArithmeticOperation::DIV || _op == ArithmeticOperation::POWER)
- {
- // Division and Power operators don't support integer arithmetic
- return validate_and_configure_window_for_division(input1, input2, output);
- }
- else
- {
- return validate_and_configure_window_for_arithmetic_operators(input1, input2, output);
- }
-}
-Status CLArithmeticOperationKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
-{
- if(_op == ArithmeticOperation::DIV || _op == ArithmeticOperation::POWER)
- {
- // Division and Power operators don't support integer arithmetic
- return validate_arguments_with_float_only_supported_rules(input1, input2, output);
- }
- else
- {
- return validate_arguments_with_arithmetic_rules(input1, input2, output);
- }
-}
-
-CLBuildOptions CLArithmeticOperationKernel::generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
-{
- return generate_build_options_with_arithmetic_rules(input1, input2, output, name());
-}
-std::string CLArithmeticOperationKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output)
-{
- return generate_id_for_tuning_common(kernel_name, input1, output);
-}
-
-std::string CLArithmeticOperationKernel::name()
-{
- return supported_arithmetic_ops[_op];
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLErodeKernel.cpp b/src/core/CL/kernels/CLErodeKernel.cpp
deleted file mode 100644
index 6cb5ffc8cc..0000000000
--- a/src/core/CL/kernels/CLErodeKernel.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLErodeKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-
-using namespace arm_compute;
-
-BorderSize CLErodeKernel::border_size() const
-{
- return BorderSize(1);
-}
-
-void CLErodeKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined);
-}
-
-void CLErodeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
- // Create kernel
- _kernel = create_kernel(compile_context, "erode");
-
- _input = input;
- _output = output;
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 8;
- constexpr unsigned int num_elems_read_per_iteration = 16;
- constexpr unsigned int num_rows_read_pes_iteration = 3;
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
- AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_pes_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
- ICLKernel::configure_internal(win);
-}
diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
index 03e6ee7dfb..3d8f875ef7 100644
--- a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
+++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,31 +21,37 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h"
+#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32);
- ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] != idx->tensor_shape().x());
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 2);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -55,32 +61,42 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input,
+ ITensorInfo *output,
+ ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_UNUSED(idx, config);
auto_init_if_empty(*output, input->clone()->set_num_channels(2));
Window win = calculate_max_window(*output, Steps());
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
return std::make_pair(Status{}, win);
}
} // namespace
-CLFFTDigitReverseKernel::CLFFTDigitReverseKernel()
- : _input(nullptr), _output(nullptr), _idx(nullptr)
+CLFFTDigitReverseKernel::CLFFTDigitReverseKernel() : _input(nullptr), _output(nullptr), _idx(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLFFTDigitReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config)
+void CLFFTDigitReverseKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *idx,
+ const FFTDigitReverseKernelInfo &config)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, idx, config);
}
-void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config)
+void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx);
+ auto padding_info = get_padding_info({input, output, idx});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), config));
_input = input;
@@ -90,6 +106,7 @@ void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context,
// Create kernel
CLBuildOptions build_opts;
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(input->info()->num_channels()));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option_if(config.conjugate, "-DCONJ");
std::string kernel_name = "fft_digit_reverse_axis_" + support::cpp11::to_string(config.axis);
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
@@ -107,12 +124,17 @@ void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context,
_config_id += support::cpp11::to_string(input->info()->dimension(0));
_config_id += "_";
_config_id += support::cpp11::to_string(input->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+Status CLFFTDigitReverseKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, config));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
return Status{};
}
@@ -132,7 +154,6 @@ void CLFFTDigitReverseKernel::run(const Window &window, cl::CommandQueue &queue)
add_3D_tensor_argument(idx, _output, slice);
add_1D_tensor_argument(idx, _idx, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.h b/src/core/CL/kernels/CLFFTDigitReverseKernel.h
new file mode 100644
index 0000000000..fdd1bcc3d3
--- /dev/null
+++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H
+#define ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ICLTensor;
+
+/** Interface for the digit reverse operation kernel. */
+class CLFFTDigitReverseKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLFFTDigitReverseKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFFTDigitReverseKernel(const CLFFTDigitReverseKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFFTDigitReverseKernel &operator=(const CLFFTDigitReverseKernel &) = delete;
+ /** Default Move Constructor. */
+ CLFFTDigitReverseKernel(CLFFTDigitReverseKernel &&) = default;
+ /** Default move assignment operator */
+ CLFFTDigitReverseKernel &operator=(CLFFTDigitReverseKernel &&) = default;
+ /** Default destructor */
+ ~CLFFTDigitReverseKernel() = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: F16/F32.
+ * @param[out] output Destination tensor. Data type supported: same as @p input
+ * @param[in] idx Digit reverse index tensor. Data type supported: U32
+ * @param[in] config Kernel configuration.
+ */
+ void
+ configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
+ /** Set the input and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. Data types supported: F16/F32.
+ * @param[out] output Destination tensor. Data type supported: same as @p input
+ * @param[in] idx Digit reverse index tensor. Data type supported: U32
+ * @param[in] config Kernel configuration.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *idx,
+ const FFTDigitReverseKernelInfo &config);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLFFTDigitReverseKernel
+ *
+ * @param[in] input Source tensor info. Data types supported: F16/F32.
+ * @param[in] output Destination tensor info. Data type supported: same as @p input
+ * @param[in] idx Digit reverse index tensor info. Data type supported: U32
+ * @param[in] config Kernel configuration.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ const ICLTensor *_idx;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H */
diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
index 63c093958c..3729e6b77d 100644
--- a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
+++ b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,17 +21,20 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h"
+#include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
+#include "support/ToolchainSupport.h"
#include <cmath>
@@ -42,13 +45,13 @@ namespace
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(CLFFTRadixStageKernel::supported_radix().count(config.radix) == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] % config.radix);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -57,9 +60,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
{
- if(output != nullptr)
+ if (output != nullptr)
{
auto_init_if_empty(*output, *input);
}
@@ -69,18 +73,14 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
steps.set(config.axis, config.radix);
Window win = calculate_max_window(*input, steps);
- if(output != nullptr)
- {
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
- }
return std::make_pair(Status{}, win);
}
} // namespace
-CLFFTRadixStageKernel::CLFFTRadixStageKernel()
- : _input(nullptr), _output(nullptr), _run_in_place(false)
+CLFFTRadixStageKernel::CLFFTRadixStageKernel() : _input(nullptr), _output(nullptr), _run_in_place(false)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLFFTRadixStageKernel::configure(ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config)
@@ -88,10 +88,15 @@ void CLFFTRadixStageKernel::configure(ICLTensor *input, ICLTensor *output, const
configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
}
-void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config)
+void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const FFTRadixStageKernelInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
@@ -99,6 +104,7 @@ void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, I
// Create build options
CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
// Create kernel
@@ -109,11 +115,12 @@ void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, I
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Set static arguments if not the first stage
- if(!config.is_first_stage)
+ if (!config.is_first_stage)
{
const unsigned int Ni = config.Nx * config.radix;
const float exp_const = (-2.0 * M_PI) / static_cast<float>(Ni);
- unsigned int idx = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+ unsigned int idx =
+ (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
_kernel.setArg<cl_uint>(idx++, config.Nx);
_kernel.setArg<cl_uint>(idx++, Ni);
_kernel.setArg<cl_float>(idx, exp_const);
@@ -132,23 +139,25 @@ void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, I
_config_id += support::cpp11::to_string(input->info()->dimension(0));
_config_id += "_";
_config_id += support::cpp11::to_string(input->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+Status CLFFTRadixStageKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const FFTRadixStageKernelInfo &config)
{
const bool run_in_place = (output == nullptr) || (output == input);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
- (run_in_place) ? nullptr : output->clone().get(),
- config)
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(), config)
+ .first);
return Status{};
}
std::set<unsigned int> CLFFTRadixStageKernel::supported_radix()
{
- return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 };
+ return std::set<unsigned int>{2, 3, 4, 5, 7, 8};
}
void CLFFTRadixStageKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -163,12 +172,11 @@ void CLFFTRadixStageKernel::run(const Window &window, cl::CommandQueue &queue)
{
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, slice);
- if(!_run_in_place)
+ if (!_run_in_place)
{
add_3D_tensor_argument(idx, _output, slice);
}
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.h b/src/core/CL/kernels/CLFFTRadixStageKernel.h
new file mode 100644
index 0000000000..de80bfced3
--- /dev/null
+++ b/src/core/CL/kernels/CLFFTRadixStageKernel.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H
+#define ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/CL/ICLKernel.h"
+
+#include <set>
+
+namespace arm_compute
+{
+// Forward declarations
+class ICLTensor;
+
+/** Interface for the FFT radix stage kernel. */
+class CLFFTRadixStageKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLFFTRadixStageKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFFTRadixStageKernel(const CLFFTRadixStageKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFFTRadixStageKernel &operator=(const CLFFTRadixStageKernel &) = delete;
+ /** Default Move Constructor. */
+ CLFFTRadixStageKernel(CLFFTRadixStageKernel &&) = default;
+ /** Default move assignment operator */
+ CLFFTRadixStageKernel &operator=(CLFFTRadixStageKernel &&) = default;
+ /** Default destructor */
+ ~CLFFTRadixStageKernel() = default;
+ /** Set the input and output tensors.
+ *
+ * @note If the output tensor is nullptr, the FFT will be performed in-place
+ *
+ * @param[in,out] input Source tensor. Data types supported: F16/F32.
+ * @param[out] output Destination tensor. Can be nullptr. Data type supported: same as @p input
+ * @param[in] config FFT descriptor metadata.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config);
+ /** Set the input and output tensors.
+ *
+ * @note If the output tensor is nullptr, the FFT will be performed in-place
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] input Source tensor. Data types supported: F16/F32.
+ * @param[out] output Destination tensor. Can be nullptr. Data type supported: same as @p input
+ * @param[in] config FFT descriptor metadata.
+ */
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const FFTRadixStageKernelInfo &config);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLFFTRadixStageKernel
+ *
+ * @param[in] input Source tensor info. Data types supported: F16/F32.
+ * @param[in] output Destination tensor info. Can be nullptr. Data type supported: same as @p input
+ * @param[in] config FFT descriptor metadata.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config);
+ /** Returns the radix values that are supported by the FFT kernel
+ *
+ * @return The set of supported radix values
+ */
+ static std::set<unsigned int> supported_radix();
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_input;
+ ICLTensor *_output;
+ bool _run_in_place;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H */
diff --git a/src/core/CL/kernels/CLFFTScaleKernel.cpp b/src/core/CL/kernels/CLFFTScaleKernel.cpp
index 4738a12b49..be6e16b074 100644
--- a/src/core/CL/kernels/CLFFTScaleKernel.cpp
+++ b/src/core/CL/kernels/CLFFTScaleKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,14 +21,17 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLFFTScaleKernel.h"
+#include "src/core/CL/kernels/CLFFTScaleKernel.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
@@ -38,10 +41,10 @@ namespace
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -50,30 +53,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
return Status{};
}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps());
-
- if(output != nullptr)
- {
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, *input->clone());
-
- // CLFFTScaleKernel doesn't need padding so update_window_and_padding() can be skipped
- Coordinates coord;
- coord.set_num_dimensions(output->num_dimensions());
- output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
- }
-
- return std::make_pair(Status{}, win);
-}
} // namespace
-CLFFTScaleKernel::CLFFTScaleKernel()
- : _input(nullptr), _output(nullptr), _run_in_place(false)
+CLFFTScaleKernel::CLFFTScaleKernel() : _input(nullptr), _output(nullptr), _run_in_place(false)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLFFTScaleKernel::configure(ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config)
@@ -81,10 +65,14 @@ void CLFFTScaleKernel::configure(ICLTensor *input, ICLTensor *output, const FFTS
configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
}
-void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config)
+void CLFFTScaleKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const FFTScaleKernelInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr));
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
@@ -93,19 +81,28 @@ void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, ICLTen
// Create kernel
CLBuildOptions build_opts;
build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(output != nullptr ? output->info()->num_channels() : input->info()->num_channels()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(output != nullptr ? output->info()->num_channels()
+ : input->info()->num_channels()));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option_if(config.conjugate, "-DCONJ");
std::string kernel_name = "fft_scale_conj";
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Set static arguments
- unsigned int idx = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+ unsigned int idx =
+ (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
_kernel.setArg<cl_float>(idx, config.scale);
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), _run_in_place ? nullptr : output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
+ Window win = calculate_max_window(*input->info(), Steps());
+
+ if (output != nullptr)
+ {
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), *input->info()->clone());
+ }
+
+ ICLKernel::configure_internal(win);
// Set config_id for enabling LWS tuning
_config_id = kernel_name;
@@ -115,13 +112,13 @@ void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, ICLTen
_config_id += support::cpp11::to_string(input->info()->dimension(0));
_config_id += "_";
_config_id += support::cpp11::to_string(input->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
Status CLFFTScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTScaleKernelInfo &config)
{
ARM_COMPUTE_UNUSED(config);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
return Status{};
}
@@ -138,12 +135,11 @@ void CLFFTScaleKernel::run(const Window &window, cl::CommandQueue &queue)
{
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, slice);
- if(!_run_in_place)
+ if (!_run_in_place)
{
add_3D_tensor_argument(idx, _output, slice);
}
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFFTScaleKernel.h b/src/core/CL/kernels/CLFFTScaleKernel.h
new file mode 100644
index 0000000000..b995282e02
--- /dev/null
+++ b/src/core/CL/kernels/CLFFTScaleKernel.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLFFTSCALEKERNEL_H
+#define ARM_COMPUTE_CLFFTSCALEKERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ICLTensor;
+
+/** Interface for the inverse FFT scale kernel. */
+class CLFFTScaleKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLFFTScaleKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFFTScaleKernel(const CLFFTScaleKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFFTScaleKernel &operator=(const CLFFTScaleKernel &) = delete;
+ /** Default move constructor */
+ CLFFTScaleKernel(CLFFTScaleKernel &&) = default;
+ /** Default move assignment operator */
+ CLFFTScaleKernel &operator=(CLFFTScaleKernel &&) = default;
+ /** Default destructor */
+ ~CLFFTScaleKernel() = default;
+ /** Set the input and output tensors, using the compile context obtained from CLKernelLibrary.
+ *
+ * @param[in,out] input Source tensor. Number of channels supported: 2. Data types supported: F16/F32.
+ * @param[out] output Destination tensor (can be nullptr). Number of channels supported: 1 or 2. Data type supported: same as @p input
+ * @param[in] config Kernel configuration
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config);
+ /** Set the input and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] input Source tensor. Number of channels supported: 2. Data types supported: F16/F32.
+ * @param[out] output Destination tensor (can be nullptr). Number of channels supported: 1 or 2. Data type supported: same as @p input
+ * @param[in] config Kernel configuration
+ */
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const FFTScaleKernelInfo &config);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLFFTScaleKernel
+ *
+ * @param[in] input Source tensor info. Number of channels supported: 2. Data types supported: F16/F32.
+ * @param[in] output Destination tensor info (can be nullptr). Data type supported: same as @p input
+ * @param[in] config Kernel configuration
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const FFTScaleKernelInfo &config);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_input; /**< Source tensor */
+ ICLTensor *_output; /**< Destination tensor */
+ bool _run_in_place; /**< When true, the output tensor is not passed to the OpenCL kernel */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLFFTSCALEKERNEL_H */
diff --git a/src/core/CL/kernels/CLFastCornersKernel.cpp b/src/core/CL/kernels/CLFastCornersKernel.cpp
deleted file mode 100644
index 4c2086c1c6..0000000000
--- a/src/core/CL/kernels/CLFastCornersKernel.cpp
+++ /dev/null
@@ -1,211 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLFastCornersKernel::CLFastCornersKernel()
- : ICLKernel(), _input(nullptr), _output(nullptr)
-{
-}
-
-BorderSize CLFastCornersKernel::border_size() const
-{
- return BorderSize(3);
-}
-
-void CLFastCornersKernel::configure(const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, threshold, non_max_suppression, border_mode);
-}
-
-void CLFastCornersKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode)
-{
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_MSG(border_mode != BorderMode::UNDEFINED, "Not implemented");
-
- _input = input;
- _output = output;
-
- // Create build options
- std::set<std::string> build_opts;
-
- if(non_max_suppression)
- {
- build_opts.emplace("-DUSE_MAXSUPPRESSION");
- }
-
- // Create kernel
- const std::string kernel_name = std::string("fast_corners");
- _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
- // Set static kernel arguments
- unsigned int idx = 2 * num_arguments_per_2D_tensor(); // Skip the input and output parameters
- _kernel.setArg<cl_float>(idx, static_cast<float>(threshold));
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 1;
- constexpr unsigned int num_elems_read_per_iteration = 7;
- constexpr unsigned int num_rows_read_per_iteration = 3;
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_mode == BorderMode::UNDEFINED, BorderSize(3));
-
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, input->info()->valid_region(), border_mode == BorderMode::UNDEFINED, border_size());
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(non_max_suppression);
- _config_id += "_";
- _config_id += lower_string(string_from_border_mode(border_mode));
-}
-
-void CLFastCornersKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
- add_2D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
-
-CLCopyToArrayKernel::CLCopyToArrayKernel()
- : ICLKernel(), _input(nullptr), _corners(nullptr), _num_buffer(nullptr)
-{
-}
-
-void CLCopyToArrayKernel::configure(const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, update_number, corners, num_buffers);
-}
-
-void CLCopyToArrayKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers)
-{
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON(corners == nullptr);
- ARM_COMPUTE_ERROR_ON(num_buffers == nullptr);
-
- _input = input;
- _corners = corners;
- _num_buffer = num_buffers;
-
- std::set<std::string> build_opts;
-
- if(update_number)
- {
- build_opts.emplace("-DUPDATE_NUMBER");
- }
-
- // Create kernel
- const std::string kernel_name = std::string("copy_to_keypoint");
- _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
- //Get how many pixels skipped in the x dimension in the previous stages
- unsigned int offset = _input->info()->valid_region().anchor.x();
-
- // Set static kernel arguments
- unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input and output parameters
- _kernel.setArg<unsigned int>(idx++, _corners->max_num_values());
- _kernel.setArg<cl_uint>(idx++, offset);
- _kernel.setArg(idx++, *_num_buffer);
- _kernel.setArg(idx++, _corners->cl_buffer());
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 1;
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- update_window_and_padding(win,
- AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-void CLCopyToArrayKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- //Initialise the _num_buffer as it used as both input and output
- static const unsigned int zero_init = 0;
- queue.enqueueWriteBuffer(*_num_buffer, CL_FALSE, 0, sizeof(unsigned int), &zero_init);
-
- Window slice = window.first_slice_window_2D();
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index b2f06b34bb..86bb502da3 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,30 +21,26 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/StringSupport.h"
-
-#include <cstdint>
-#include <set>
-#include <sstream>
-#include <string>
-using namespace arm_compute;
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
-CLFillBorderKernel::CLFillBorderKernel()
- : ICLKernel(), _tensor(nullptr)
+namespace arm_compute
+{
+CLFillBorderKernel::CLFillBorderKernel() : ICLKernel(), _tensor(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
bool CLFillBorderKernel::is_parallelisable() const
@@ -60,20 +56,38 @@ void CLFillBorderKernel::set_constant_border(unsigned int idx, const PixelValue
ICLKernel::add_argument<T>(idx, static_cast<T>(value));
}
-void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void CLFillBorderKernel::configure(ICLTensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
configure(CLKernelLibrary::get().get_compile_context(), tensor, border_size, border_mode, constant_border_value);
}
-void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void CLFillBorderKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
+{
+ _tensor = tensor;
+ configure(compile_context, tensor->info(), border_size, border_mode, constant_border_value);
+}
+
+void CLFillBorderKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
ARM_COMPUTE_ERROR_ON(tensor == nullptr);
- ARM_COMPUTE_ERROR_ON(tensor->info()->num_channels() != 1);
+ ARM_COMPUTE_ERROR_ON(tensor->num_channels() != 1);
+ auto padding_info = get_padding_info({tensor});
- border_size.limit(tensor->info()->padding());
+ border_size.limit(tensor->padding());
// If there is no border: early exit
- if(border_size.empty() || border_mode == BorderMode::UNDEFINED)
+ if (border_size.empty() || border_mode == BorderMode::UNDEFINED)
{
return;
}
@@ -81,11 +95,11 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ICLT
// Select appropriate kernel
std::string kernel_name = "fill_image_borders_" + lower_string(string_from_border_mode(border_mode));
- const DataType dt = tensor->info()->data_type();
+ const DataType dt = tensor->data_type();
// Define build options
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(dt));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt));
build_opts.add_option("-DBORDER_SIZE_TOP=" + support::cpp11::to_string(border_size.top));
build_opts.add_option("-DBORDER_SIZE_BOTTOM=" + support::cpp11::to_string(border_size.bottom));
build_opts.add_option("-DBORDER_SIZE_LEFT=" + support::cpp11::to_string(border_size.left));
@@ -93,28 +107,24 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ICLT
// Create kernel
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
- _tensor = tensor;
// Create static kernel arguments
- const unsigned int valid_width = tensor->info()->valid_region().shape[0];
- const unsigned int valid_height = tensor->info()->valid_region().shape[1];
- const cl_int2 valid_region_coords =
- {
- {
- static_cast<cl_int>(tensor->info()->valid_region().anchor[0]),
- static_cast<cl_int>(tensor->info()->valid_region().anchor[1]),
- }
- };
- const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
+ const unsigned int valid_width = tensor->valid_region().shape[0];
+ const unsigned int valid_height = tensor->valid_region().shape[1];
+ const cl_int2 valid_region_coords = {{
+ static_cast<cl_int>(tensor->valid_region().anchor[0]),
+ static_cast<cl_int>(tensor->valid_region().anchor[1]),
+ }};
+ const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
// Set static kernel arguments
unsigned int idx = num_arguments_per_3D_tensor(); //Skip the tensor parameters
ICLKernel::add_argument<cl_uint>(idx, valid_width);
ICLKernel::add_argument<cl_uint>(idx, valid_height);
ICLKernel::add_argument<cl_int2>(idx, valid_region_coords);
- if(BorderMode::CONSTANT == border_mode)
+ if (BorderMode::CONSTANT == border_mode)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -154,7 +164,7 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ICLT
Window win;
win.set(Window::DimX, Window::Dimension(0, total_valid_width + valid_height));
win.set(Window::DimY, Window::Dimension(0, 1, 1));
- win.use_tensor_dimensions(tensor->info()->tensor_shape(), Window::DimZ);
+ win.use_tensor_dimensions(tensor->tensor_shape(), Window::DimZ);
ICLKernel::configure_internal(win);
// Set config_id for enabling LWS tuning
@@ -162,17 +172,43 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ICLT
_config_id += "_";
_config_id += lower_string(string_from_data_type(dt));
_config_id += "_";
- _config_id += support::cpp11::to_string(tensor->info()->dimension(0));
+ _config_id += support::cpp11::to_string(tensor->dimension(0));
_config_id += "_";
- _config_id += support::cpp11::to_string(tensor->info()->dimension(1));
+ _config_id += support::cpp11::to_string(tensor->dimension(1));
_config_id += "_";
_config_id += lower_string(string_from_border_mode(border_mode));
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+void CLFillBorderKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ // No OpenCL kernel was created (border mode undefined or border width == 0): nothing to run
+ if (_kernel() == nullptr)
+ {
+ return;
+ }
+
+ const auto tensor = // taken from the pack's ACL_SRC slot, not from the stored _tensor
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0; // kernel argument index restarts for every slice
+ add_3D_tensor_argument(idx, tensor, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice));
}
void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue)
{
// Border mode undefined or border width == 0
- if(_kernel() == nullptr)
+ if (_kernel() == nullptr)
{
return;
}
@@ -188,6 +224,6 @@ void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue)
unsigned int idx = 0;
add_3D_tensor_argument(idx, _tensor, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFillBorderKernel.h b/src/core/CL/kernels/CLFillBorderKernel.h
new file mode 100644
index 0000000000..5782143cf9
--- /dev/null
+++ b/src/core/CL/kernels/CLFillBorderKernel.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2016-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLFILLBORDERKERNEL_H
+#define ARM_COMPUTE_CLFILLBORDERKERNEL_H
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel that fills the border of a tensor */
+class CLFillBorderKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLFillBorderKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFillBorderKernel(const CLFillBorderKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFillBorderKernel &operator=(const CLFillBorderKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLFillBorderKernel(CLFillBorderKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLFillBorderKernel &operator=(CLFillBorderKernel &&) = default;
+ /** Default destructor */
+ ~CLFillBorderKernel() = default;
+
+ /** Initialise the kernel's tensor, border size and border mode.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] tensor Tensor to process. Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32.
+ * @param[in] border_size Size of the border to fill in elements.
+ * @param[in] border_mode Border mode to use for the convolution.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
+ /** Initialise the kernel's tensor, border size and border mode, using the compile context obtained from CLKernelLibrary.
+ *
+ * @param[in,out] tensor Tensor to process. Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32.
+ * @param[in] border_size Size of the border to fill in elements.
+ * @param[in] border_mode Border mode to use for the convolution.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ICLTensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
+ /** Initialise the kernel's tensor info, border size and border mode (no tensor pointer is stored by this overload).
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] tensor Tensor info to process. Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32.
+ * @param[in] border_size Size of the border to fill in elements.
+ * @param[in] border_mode Border mode to use for the convolution.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
+
+ /** Function to set the constant value on fill border kernel depending on type.
+ *
+ * @param[in] idx Index of the kernel argument to set.
+ * @param[in] constant_border_value Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ template <class T>
+ void set_constant_border(unsigned int idx, const PixelValue &constant_border_value);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ bool is_parallelisable() const override;
+
+private:
+ ICLTensor *_tensor; /**< Tensor used by run(); set by the ICLTensor configure() overloads */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLFILLBORDERKERNEL_H */
diff --git a/src/core/CL/kernels/CLFlattenLayerKernel.cpp b/src/core/CL/kernels/CLFlattenLayerKernel.cpp
deleted file mode 100644
index bf2c891169..0000000000
--- a/src/core/CL/kernels/CLFlattenLayerKernel.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_flatten_shape(input));
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_flatten_shape(input)));
-
- Window win = calculate_max_window(*input, Steps()); // Flatten does not need paddings
-
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
-
- return std::make_pair(Status{}, win);
-}
-} // namespace
-
-CLFlattenLayerKernel::CLFlattenLayerKernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLFlattenLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLFlattenLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
- _input = input;
- _output = output;
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
- build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
- build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
- build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
- build_opts.add_option_if(output->info()->num_dimensions() > 2, "-DDST_DIM1=" + support::cpp11::to_string(output->info()->dimension(1)));
-
- // Create kernel
- _kernel = create_kernel(compile_context, "flatten", build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = "flatten";
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-Status CLFlattenLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
- return Status{};
-}
-
-void CLFlattenLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- Window collapsed_window = window.collapse(ICLKernel::window(), Window::DimZ);
-
- Window output_window;
- output_window.use_tensor_dimensions(_output->info()->tensor_shape());
-
- // Run kernel
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, collapsed_window);
- add_3D_tensor_argument(idx, _output, output_window);
- enqueue(queue, *this, collapsed_window, lws_hint());
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFloorKernel.cpp b/src/core/CL/kernels/CLFloorKernel.cpp
deleted file mode 100644
index 9b2133d796..0000000000
--- a/src/core/CL/kernels/CLFloorKernel.cpp
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLFloorKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-
- // Validate in case of configured output
- if(output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- auto_init_if_empty(*output, *input);
-
- const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
-
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- bool window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, input->valid_region());
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLFloorKernel::CLFloorKernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLFloorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Auto initialize output
- auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type());
-
- // Validate
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
- _input = input;
- _output = output;
-
- const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
- std::set<std::string> build_opts;
- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
- // Create kernel
- _kernel = create_kernel(compile_context, "floor_layer", build_opts);
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-}
-
-void CLFloorKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-Status CLFloorKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
-
- return Status{};
-}
-
-void CLFloorKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
index fd03e8379c..7da0679ae4 100644
--- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,27 +21,35 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
+#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+Status validate_arguments(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias,
+ const ITensorInfo *bn_beta,
+ const ITensorInfo *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
ARM_COMPUTE_UNUSED(epsilon);
ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var);
@@ -52,43 +60,44 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b
ARM_COMPUTE_RETURN_ERROR_ON(input_bias == nullptr && fused_bias == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON(bn_mean->num_dimensions() > 1);
- if(fbn_type == FuseBatchNormalizationType::CONVOLUTION)
+ if (fbn_type == FuseBatchNormalizationType::CONVOLUTION)
{
ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(3) != bn_mean->dimension(0));
}
else
{
- const size_t channel_idx = get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t channel_idx =
+ get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(channel_idx) != bn_mean->dimension(0));
}
// Validate bias
- if(input_bias != nullptr)
+ if (input_bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, input_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, input_bias);
}
// Validate beta
- if(bn_beta != nullptr)
+ if (bn_beta != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_beta);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_beta);
}
// Validate gamma
- if(bn_gamma != nullptr)
+ if (bn_gamma != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_gamma);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_gamma);
}
// Validate output weights
- if(fused_weights != nullptr && fused_weights->total_size() != 0)
+ if (fused_weights != nullptr && fused_weights->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_weights, fused_weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input_weights, fused_weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_weights);
}
// Validate output bias
- if(fused_bias != nullptr && fused_bias->total_size() != 0)
+ if (fused_bias != nullptr && fused_bias->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, fused_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_bias);
@@ -99,26 +108,53 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b
} // namespace
CLFuseBatchNormalizationKernel::CLFuseBatchNormalizationKernel()
- : _input_weights(nullptr), _input_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(),
- _run_in_place_weights(false), _run_in_place_bias(false)
+ : _input_weights(nullptr),
+ _input_bias(nullptr),
+ _bn_mean(nullptr),
+ _bn_var(nullptr),
+ _bn_gamma(nullptr),
+ _bn_beta(nullptr),
+ _fused_weights(nullptr),
+ _fused_bias(nullptr),
+ _epsilon(),
+ _run_in_place_weights(false),
+ _run_in_place_bias(false)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLFuseBatchNormalizationKernel::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
- ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalizationKernel::configure(const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias,
+ const ICLTensor *bn_beta,
+ const ICLTensor *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+ configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+ input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
}
-void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
- ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias,
+ const ICLTensor *bn_beta,
+ const ICLTensor *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var);
+ auto padding_info =
+ get_padding_info({input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma});
+
_input_weights = input_weights;
_input_bias = input_bias;
_bn_mean = bn_mean;
@@ -130,28 +166,28 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c
_epsilon = epsilon;
_run_in_place_weights = (fused_weights == nullptr) || (fused_weights == input_weights);
- _run_in_place_bias = (input_bias != nullptr && fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias);
+ _run_in_place_bias =
+ (input_bias != nullptr && fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias);
// Auto initialize outputs
- if(_fused_weights != nullptr)
+ if (_fused_weights != nullptr)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*_fused_weights->info(), *_input_weights->info()->clone());
}
- if(_fused_bias != nullptr)
+ if (_fused_bias != nullptr)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*_fused_bias->info(), *_bn_mean->info()->clone());
}
// Validate arguments
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_weights->info(), bn_mean->info(), bn_var->info(),
- (fused_weights != nullptr) ? fused_weights->info() : nullptr,
- (fused_bias != nullptr) ? fused_bias->info() : nullptr,
- (input_bias != nullptr) ? input_bias->info() : nullptr,
- (bn_beta != nullptr) ? bn_beta->info() : nullptr,
- (bn_gamma != nullptr) ? bn_gamma->info() : nullptr,
- epsilon, fbn_type));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
+ input_weights->info(), bn_mean->info(), bn_var->info(),
+ (fused_weights != nullptr) ? fused_weights->info() : nullptr,
+ (fused_bias != nullptr) ? fused_bias->info() : nullptr, (input_bias != nullptr) ? input_bias->info() : nullptr,
+ (bn_beta != nullptr) ? bn_beta->info() : nullptr, (bn_gamma != nullptr) ? bn_gamma->info() : nullptr, epsilon,
+ fbn_type));
// Configure kernel window
Window win = calculate_max_window(*input_weights->info());
@@ -160,7 +196,8 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c
// Set build options
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input_weights->info()->data_type()));
- build_opts.add_option_if(fbn_type == FuseBatchNormalizationType::CONVOLUTION, "-DDIM2=" + support::cpp11::to_string(input_weights->info()->dimension(2)));
+ build_opts.add_option_if(fbn_type == FuseBatchNormalizationType::CONVOLUTION,
+ "-DDIM2=" + support::cpp11::to_string(input_weights->info()->dimension(2)));
build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon));
build_opts.add_option_if(_input_weights->info()->data_layout() == DataLayout::NHWC, "-DNHWC");
build_opts.add_option_if(_run_in_place_weights, "-DIN_PLACE_W");
@@ -171,14 +208,23 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c
// Create kernel
_kernel = create_kernel(compile_context, "fuse_batchnormalization_layer", build_opts.options());
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias,
+ const ITensorInfo *bn_beta,
+ const ITensorInfo *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+ input_bias, bn_beta, bn_gamma, epsilon, fbn_type));
return Status{};
}
@@ -195,25 +241,25 @@ void CLFuseBatchNormalizationKernel::run(const arm_compute::Window &window, cl::
// Add kernel arguments
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input_weights, slice_3d);
- if(_input_bias != nullptr)
+ if (_input_bias != nullptr)
{
add_1D_tensor_argument(idx, _input_bias, slice_1d);
}
add_1D_tensor_argument(idx, _bn_mean, slice_1d);
add_1D_tensor_argument(idx, _bn_var, slice_1d);
- if(!_run_in_place_weights)
+ if (!_run_in_place_weights)
{
add_3D_tensor_argument(idx, _fused_weights, slice_3d);
}
- if(!_run_in_place_bias)
+ if (!_run_in_place_bias)
{
add_1D_tensor_argument(idx, _fused_bias, slice_1d);
}
- if(_bn_beta != nullptr)
+ if (_bn_beta != nullptr)
{
add_1D_tensor_argument(idx, _bn_beta, slice_1d);
}
- if(_bn_gamma != nullptr)
+ if (_bn_gamma != nullptr)
{
add_1D_tensor_argument(idx, _bn_gamma, slice_1d);
}
diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h
new file mode 100644
index 0000000000..76ec7a759f
--- /dev/null
+++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLFUSEBATCHNORMALIZATIONKERNEL_H
+#define ARM_COMPUTE_CLFUSEBATCHNORMALIZATIONKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ICLTensor;
+
+/** OpenCL kernel to fuse the batch normalization node to a preceding convolution node */
+class CLFuseBatchNormalizationKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLFuseBatchNormalizationKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFuseBatchNormalizationKernel(const CLFuseBatchNormalizationKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFuseBatchNormalizationKernel &operator=(const CLFuseBatchNormalizationKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLFuseBatchNormalizationKernel(CLFuseBatchNormalizationKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLFuseBatchNormalizationKernel &operator=(CLFuseBatchNormalizationKernel &&) = default;
+ /** Default destructor */
+ ~CLFuseBatchNormalizationKernel() = default;
+ /** Set the source, destination of the kernel
+ *
+ * @param[in] input_weights Input weights tensor for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
+ * @param[in] bn_mean Batch normalization layer mean tensor. Same as @p input_weights
+ * @param[in] bn_var Batch normalization layer variance tensor. Same as @p input_weights
+ * @param[out] fused_weights Output fused weights tensor. It can be a nullptr in case of in-place computation. Same as @p input_weights
+ * @param[out] fused_bias Output fused bias tensor. It can be a nullptr in case of in-place computation and input_bias != nullptr. Same as @p input_weights
+ * @param[in] input_bias (Optional) Input bias tensor for convolution or depthwise convolution layer. It can be a nullptr in case the bias tensor is not required. Same as @p input_weights
+ * @param[in] bn_beta (Optional) Batch normalization layer beta tensor. It can be a nullptr in case the beta tensor is not required. Same as @p input_weights
+ * @note if nullptr, bn_beta is set to 0.0
+ * @param[in] bn_gamma (Optional) Batch normalization layer gamma tensor. It can be a nullptr in case the gamma tensor is not required. Same as @p input_weights
+ * @note if nullptr, bn_gamma is set to 1.0
+ * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
+ * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
+ */
+ void configure(const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias = nullptr,
+ const ICLTensor *bn_beta = nullptr,
+ const ICLTensor *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ /** Set the source, destination of the kernel
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input_weights Input weights tensor for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
+ * @param[in] bn_mean Batch normalization layer mean tensor. Same as @p input_weights
+ * @param[in] bn_var Batch normalization layer variance tensor. Same as @p input_weights
+ * @param[out] fused_weights Output fused weights tensor. It can be a nullptr in case of in-place computation. Same as @p input_weights
+ * @param[out] fused_bias Output fused bias tensor. It can be a nullptr in case of in-place computation and input_bias != nullptr. Same as @p input_weights
+ * @param[in] input_bias (Optional) Input bias tensor for convolution or depthwise convolution layer. It can be a nullptr in case the bias tensor is not required. Same as @p input_weights
+ * @param[in] bn_beta (Optional) Batch normalization layer beta tensor. It can be a nullptr in case the beta tensor is not required. Same as @p input_weights
+ * @note if nullptr, bn_beta is set to 0.0
+ * @param[in] bn_gamma (Optional) Batch normalization layer gamma tensor. It can be a nullptr in case the gamma tensor is not required. Same as @p input_weights
+ * @note if nullptr, bn_gamma is set to 1.0
+ * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
+ * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias = nullptr,
+ const ICLTensor *bn_beta = nullptr,
+ const ICLTensor *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLFuseBatchNormalizationKernel
+ *
+ * @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
+ * @param[in] bn_mean Batch normalization layer mean tensor info. Same as @p input_weights
+ * @param[in] bn_var Batch normalization layer variance tensor info. Same as @p input_weights
+ * @param[in] fused_weights Output fused weights tensor info. It can be a nullptr in case of in-place computation. Same as @p input_weights
+ * @param[in] fused_bias Output fused bias tensor info. It can be a nullptr in case of in-place computation and input_bias != nullptr. Same as @p input_weights
+ * @param[in] input_bias (Optional) Input bias tensor info for convolution or depthwise convolution layer. It can be a nullptr in case the bias tensor is not required. Same as @p input_weights
+ * @param[in] bn_beta (Optional) Batch normalization layer beta tensor info. It can be a nullptr in case the beta tensor is not required. Same as @p input_weights
+ * @note if nullptr, bn_beta is set to 0.0
+ * @param[in] bn_gamma (Optional) Batch normalization layer gamma tensor info. It can be a nullptr in case the gamma tensor is not required. Same as @p input_weights
+ * @note if nullptr, bn_gamma is set to 1.0
+ * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
+ * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias = nullptr,
+ const ITensorInfo *bn_beta = nullptr,
+ const ITensorInfo *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input_weights;
+ const ICLTensor *_input_bias;
+ const ICLTensor *_bn_mean;
+ const ICLTensor *_bn_var;
+ const ICLTensor *_bn_gamma;
+ const ICLTensor *_bn_beta;
+ ICLTensor *_fused_weights;
+ ICLTensor *_fused_bias;
+ float _epsilon;
+ bool _run_in_place_weights;
+ bool _run_in_place_bias;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLFUSEBATCHNORMALIZATIONKERNEL_H */
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
deleted file mode 100644
index 9e0594b129..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
+++ /dev/null
@@ -1,336 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-namespace arm_compute
-{
-using namespace misc::shape_calculator;
-
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- if(input0->data_type() == DataType::QASYMM8)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
-
- const int m = gemm_info.m();
- const int n = gemm_info.n();
- const int k = gemm_info.k();
-
- ARM_COMPUTE_UNUSED(m);
- ARM_COMPUTE_UNUSED(n);
- ARM_COMPUTE_UNUSED(k);
-
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k));
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != static_cast<unsigned int>(n));
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(1) != static_cast<unsigned int>(k));
- if(gemm_info.reinterpret_input_as_3d())
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != static_cast<unsigned int>(m));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m));
- }
-
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
-
- Window win{};
- Window win_out{};
- bool window_changed = false;
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_output_as_3d to be false.
- if(reinterpret_input_as_3d == reinterpret_output_as_3d)
- {
- reinterpret_output_as_3d = false;
- }
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)).set_data_type(DataType::S32));
-
- TensorInfo tmp_info(*output);
-
- if(reinterpret_output_as_3d)
- {
- // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(output->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- // Configure kernel window
- num_elems_processed_per_iteration_x = rhs_info.n0;
- num_elems_processed_per_iteration_y = lhs_info.m0;
-
- // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor
- // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic
- const int m = reinterpret_output_as_3d ? gemm_info.m() : input0->dimension(1);
- const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
-
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic input0_access(input0, 0, 0,
- input0->dimension(0),
- input0->dimension(1) + bottom_pad);
- AccessWindowStatic input1_access(input1, 0, 0,
- ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
- input1->dimension(1));
- AccessWindowStatic output_access(output, 0, 0,
- ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
- output->dimension(1) + bottom_pad);
-
- window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
-
- output_access.set_valid_region(win_out, ValidRegion(Coordinates(), output->tensor_shape()));
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMLowpMatrixMultiplyNativeKernel::CLGEMMLowpMatrixMultiplyNativeKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false)
-{
-}
-
-void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, lhs_info, rhs_info, gemm_info);
-}
-
-void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info));
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
- _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
- {
- _reinterpret_input_as_3d = false;
- _reinterpret_output_as_3d = false;
- }
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
- _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
- build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
- build_opts.add_option("-DM=" + support::cpp11::to_string(input0->info()->dimension(1)));
- build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
- build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k()));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
- build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type()));
-
- std::string kernel_name("gemmlowp_mm_native");
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
- _config_id += "_";
- _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(gemm_info.k());
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.m0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.n0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.k0);
-}
-
-Status CLGEMMLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
-{
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, lhs_info, rhs_info, gemm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
- input1->clone().get(),
- output->clone().get(),
- lhs_info,
- rhs_info,
- gemm_info,
- num_elements_processed)
- .first);
-
- return Status{};
-}
-
-void CLGEMMLowpMatrixMultiplyNativeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- if(_input1->info()->num_dimensions() < 3)
- {
- // The stride_z for matrix B must be zero if we do not slice
- ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
- }
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- if(_reinterpret_input_as_3d)
- {
- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
- const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- if(_reinterpret_output_as_3d)
- {
- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
- const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- do
- {
- Window slice_b = slice;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input0, slice);
- add_2D_tensor_argument(idx, _input1, slice_b);
- add_2D_tensor_argument(idx, _output, slice);
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
- enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
deleted file mode 100644
index 76303cfd16..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
+++ /dev/null
@@ -1,320 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-namespace arm_compute
-{
-using namespace misc::shape_calculator;
-
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose);
- ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
-
- const int m = gemm_info.m();
- const int n = gemm_info.n();
- const int k = gemm_info.k();
-
- TensorShape tensor_shape0{ input0->tensor_shape() };
- tensor_shape0.set(0, k);
- tensor_shape0.set(1, m);
-
- TensorShape tensor_shape1{ input1->tensor_shape() };
- tensor_shape1.set(0, n);
- tensor_shape1.set(1, k);
-
- const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
- const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
-
- const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
- const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
-
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
-
- Window win{};
- Window win_out{};
- bool window_changed = false;
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)).set_data_type(DataType::S32));
-
- TensorInfo tmp_info(*output);
-
- if(reinterpret_output_as_3d)
- {
- // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(output->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- // Configure kernel window
- num_elems_processed_per_iteration_x = rhs_info.n0;
- num_elems_processed_per_iteration_y = lhs_info.m0;
-
- // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor
- // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic
- const int m = gemm_info.m();
- const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
-
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic input0_access(input0, 0, 0,
- ceil_to_multiple(input0->dimension(0), num_elems_processed_per_iteration_y),
- input0->dimension(1));
- AccessWindowStatic input1_access(input1, 0, 0,
- ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
- input1->dimension(1));
- AccessWindowStatic output_access(output, 0, 0,
- ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
- output->dimension(1) + bottom_pad);
-
- window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
-
- output_access.set_valid_region(win_out, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMLowpMatrixMultiplyReshapedKernel::CLGEMMLowpMatrixMultiplyReshapedKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1), _use_dummy_work_items(false)
-{
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, lhs_info, rhs_info, gemm_info);
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info));
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
- _k = gemm_info.k();
- _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
- _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
- build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
- build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
- build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
- build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m()));
- build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
- build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
- build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
- build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type()));
-
- std::string kernel_name("gemmlowp_mm_reshaped_");
- kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
- kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt";
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
- _config_id += "_";
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(gemm_info.k());
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.m0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.n0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.k0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.v0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.h0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.interleave);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.interleave);
-}
-
-Status CLGEMMLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
-{
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, lhs_info, rhs_info, gemm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
- input1->clone().get(),
- output->clone().get(),
- lhs_info,
- rhs_info,
- gemm_info,
- num_elements_processed)
- .first);
-
- return Status{};
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- if(_input1->info()->num_dimensions() < 3)
- {
- // The stride_z for matrix B must be zero if we do not slice
- ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
- }
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- if(_reinterpret_output_as_3d)
- {
- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 4;
- const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- do
- {
- Window slice_b = slice;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input0, slice);
- add_2D_tensor_argument(idx, _input1, slice_b);
- add_2D_tensor_argument(idx, _output, slice);
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
- enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
deleted file mode 100644
index 11f45e894a..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
+++ /dev/null
@@ -1,577 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info,
- const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- if(input0->data_type() == DataType::QASYMM8)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
-
- const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info;
- const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info;
- const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), "Only 2,3,4,8,16 are supported for k0");
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
-
- const int m = gemm_info.m;
- const int n = gemm_info.n;
- const int k = gemm_info.k;
-
- TensorShape tensor_shape1{ input1->tensor_shape() };
- tensor_shape1.set(0, n);
- tensor_shape1.set(1, k);
-
- const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
- const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
-
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k));
- if(gemm_info.reinterpret_input_as_3d)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != static_cast<unsigned int>(m));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m));
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
-
- const TensorShape expected_output_shape = compute_mm_shape(*input0, *input1, gemm_info);
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(expected_output_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- if(output_stage.type == GEMMLowpOutputStageType::NONE)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
- }
- }
-
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != bias->dimension(0));
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT),
- "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported");
-
- // Checks performed if the output stage needs to be fused
- if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- // If a_offset == 0, vector_sum_col can be a nullptr
- if(gemm_info.a_offset != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_output_shape[0]);
- }
-
- // If b_offset == 0, vector_sum_row can be a nullptr
- if(gemm_info.b_offset != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
-
- // Check if mm result is a 3D reinterpretation
- const bool reinterpret_as_3d = expected_output_shape.num_dimensions() > 1 && expected_output_shape.y() != vector_sum_row->tensor_shape().x();
-
- // Validate input
- ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_output_shape[1] * expected_output_shape[2]));
- ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_output_shape[1]);
-
- if(expected_output_shape.num_dimensions() > 1)
- {
- const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
-
- TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
- vector_sum_row_shape.collapse_from(1);
- TensorShape collapsed_output_shape(expected_output_shape);
- collapsed_output_shape.collapse_from(output_batch_idx);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_output_shape[output_batch_idx],
- "vector_sum_row must have the same number of batches of output tensor");
-
- if(gemm_info.a_offset != 0)
- {
- TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
- vector_sum_col_shape.collapse_from(1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
- "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
- }
- }
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != output->data_type());
- }
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
-
- if(output_multipliers != nullptr && output_shifts != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
- if(output_stage.is_quantized_per_channel)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != output_shifts->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != output_multipliers->dimension(0));
- }
- }
- }
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMKernelInfo &gemm_info,
- ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, ITensorInfo *bias,
- ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed)
-{
- const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
-
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
- bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0);
-
- Window win{};
- Window win_out{};
- bool window_changed = false;
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(reinterpret_input_as_3d == reinterpret_output_as_3d)
- {
- reinterpret_output_as_3d = false;
- }
-
- // Output tensor auto initialization if not yet initialized
- const TensorShape expected_output_shape = compute_mm_shape(*input0, *input1, gemm_info);
- if(output_stage.type != GEMMLowpOutputStageType::NONE)
- {
- auto_init_if_empty(*output, input0->clone()->set_tensor_shape(expected_output_shape).set_data_type(output_stage.output_data_type));
- }
- else
- {
- auto_init_if_empty(*output, input0->clone()->set_tensor_shape(expected_output_shape).set_data_type(DataType::S32));
- }
-
- TensorInfo tmp_info(*output);
-
- if(reinterpret_output_as_3d)
- {
- // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(output->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- // Configure kernel window
- num_elems_processed_per_iteration_x = gemm_info.rhs_info.n0;
- num_elems_processed_per_iteration_y = gemm_info.lhs_info.m0;
-
- // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor
- // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic
- const int m = reinterpret_output_as_3d ? gemm_info.m : input0->dimension(1);
- const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
-
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic input0_access(input0, 0, 0,
- ceil_to_multiple(input0->dimension(0), gemm_info.lhs_info.k0),
- input0->dimension(1) + bottom_pad);
- AccessWindowStatic input1_access(input1, 0, 0,
- input1->dimension(0),
- input1->dimension(1));
- AccessWindowStatic output_access(output, 0, 0,
- ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
- output->dimension(1) + bottom_pad);
-
- window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
-
- if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- if(gemm_info.a_offset != 0)
- {
- AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x);
- window_changed = window_changed || update_window_and_padding(win_out, vector_sum_col_access);
- }
- // No access window needed for vector_sum_row
- ARM_COMPUTE_UNUSED(vector_sum_row);
-
- if(bias != nullptr)
- {
- AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x);
- window_changed = window_changed || update_window_and_padding(win_out, bias_access);
- }
-
- if(output_multipliers != nullptr && output_multipliers->dimension(0) > 1)
- {
- AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x);
- AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x);
- window_changed = window_changed || update_window_and_padding(win_out, output_multipliers_access, output_shifts_access);
- }
- }
-
- output_access.set_valid_region(win_out, ValidRegion(Coordinates(), output->tensor_shape()));
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel()
- : _input0(nullptr),
- _input1(nullptr),
- _output(nullptr),
- _vector_sum_col(nullptr),
- _vector_sum_row(nullptr),
- _bias(nullptr),
- _output_multipliers(nullptr),
- _output_shifts(nullptr),
- _slide_matrix_b(true),
- _reinterpret_input_as_3d(false),
- _reinterpret_output_as_3d(false),
- _use_dummy_work_items(false),
- _is_quantized_per_channel(false),
- _fuse_output_stage(false)
-{
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info,
- const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts);
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output,
- const GEMMKernelInfo &gemm_info,
- const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(),
- input1->info(),
- output->info(),
- gemm_info,
- vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
- vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
- bias != nullptr ? bias->info() : nullptr,
- output_multipliers != nullptr ? output_multipliers->info() : nullptr,
- output_shifts != nullptr ? output_shifts->info() : nullptr));
-
- const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info;
- const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info;
- const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
- const int32_t a_offset = gemm_info.a_offset;
- const int32_t b_offset = gemm_info.b_offset;
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _vector_sum_col = vector_sum_col;
- _vector_sum_row = vector_sum_row;
- _bias = bias;
- _output_multipliers = output_multipliers;
- _output_shifts = output_shifts;
- _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
- _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0);
- _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
- _is_quantized_per_channel = output_stage.is_quantized_per_channel;
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
- {
- _reinterpret_input_as_3d = false;
- _reinterpret_output_as_3d = false;
- }
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
- _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(),
- input1->info(),
- output->info(),
- gemm_info,
- vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
- vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
- bias != nullptr ? bias->info() : nullptr,
- output_multipliers != nullptr ? output_multipliers->info() : nullptr,
- output_shifts != nullptr ? output_shifts->info() : nullptr,
- num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
- build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
- build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
- build_opts.add_option("-DM=" + support::cpp11::to_string(input0->info()->dimension(1)));
- build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
- build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
- build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
- build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type()));
-
- std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_");
- kernel_name += rhs_info.transpose ? "t" : "nt";
-
- if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- kernel_name += "_fused_output_stage_fixedpoint";
- _fuse_output_stage = true;
- // If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
- {
- build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
- build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
- }
- // If b_offset == 0, vector_sum_row can be a nullptr
- build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
- build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * input0->info()->dimension(0)));
- build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
- build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
- build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
- build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0]));
- build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
-
- const int min = output_stage.gemmlowp_min_bound;
- const int max = output_stage.gemmlowp_max_bound;
-
- PixelValue min_val{};
- PixelValue max_val{};
- std::tie(min_val, max_val) = get_min_max(output->info()->data_type());
- build_opts.add_option_if(min != min_val.get<int32_t>(), "-DMIN_BOUND=" + support::cpp11::to_string(min));
- build_opts.add_option_if(max != max_val.get<int32_t>(), "-DMAX_BOUND=" + support::cpp11::to_string(max));
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
- _config_id += "_";
- _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(gemm_info.k);
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.m0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.n0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.k0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.h0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.interleave);
-}
-
-Status CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info,
- const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
- input1->clone().get(),
- output->clone().get(),
- gemm_info,
- vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
- vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
- bias != nullptr ? bias->clone().get() : nullptr,
- output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr,
- output_shifts != nullptr ? output_shifts->clone().get() : nullptr,
- num_elements_processed)
- .first);
-
- return Status{};
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- if(_input1->info()->num_dimensions() < 3)
- {
- // The stride_z for matrix B must be zero if we do not slice
- ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
- }
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- if(_reinterpret_input_as_3d)
- {
- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
- const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- if(_reinterpret_output_as_3d)
- {
- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
- const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- // Set window for vector_sum_col
- Window win_vector_sum_col = slice;
- win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- // Set window for vector_sum_row
- Window win_vector_sum_row = slice;
- win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Window biases_slice = slice;
- biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
- biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
- do
- {
- Window slice_b = slice;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input0, slice);
- add_2D_tensor_argument(idx, _input1, slice_b);
- add_2D_tensor_argument(idx, _output, slice);
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
- if(_reinterpret_input_as_3d)
- {
- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
- idx++;
- }
-
- if(_reinterpret_output_as_3d)
- {
- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- idx++;
- }
-
- if(_fuse_output_stage)
- {
- add_2D_tensor_argument_if((_vector_sum_col != nullptr), idx, _vector_sum_col, win_vector_sum_col);
- add_2D_tensor_argument_if((_vector_sum_row != nullptr), idx, _vector_sum_row, win_vector_sum_row);
- add_1D_tensor_argument_if((_bias != nullptr), idx, _bias, biases_slice);
- add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_multipliers, biases_slice);
- add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_shifts, biases_slice);
- }
- enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
deleted file mode 100644
index dc8eb76c23..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <cstdint>
-
-using namespace arm_compute;
-
-namespace arm_compute
-{
-class Coordinates;
-} // namespace arm_compute
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
- int32_t a_offset, int32_t b_offset)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
-
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
- }
-
- // If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
- }
-
- // If b_offset == 0, vector_sum_row can be a nullptr
- if(b_offset != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
-
- // Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
-
- // Validate input
- ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
- ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
-
- TensorShape output_shape = mm_result->tensor_shape();
- if(output_shape.num_dimensions() > 1)
- {
- const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
-
- TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
- vector_sum_row_shape.collapse_from(1);
- output_shape.collapse_from(output_batch_idx);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
- "mm_result tensor must have the same number of batches of output tensor");
-
- if(a_offset != 0)
- {
- TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
- vector_sum_col_shape.collapse_from(1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
- "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
- }
- }
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, ITensorInfo *bias,
- int32_t a_offset, int32_t b_offset)
-{
- constexpr unsigned int num_elems_processed_per_iteration = 4;
- bool window_changed = false;
-
- // Configure kernel window
- Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal mm_result_access(mm_result, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, mm_result_access);
-
- if(a_offset != 0)
- {
- AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, vector_sum_col_access);
- }
- if(b_offset != 0)
- {
- AccessWindowStatic vector_sum_row_access(vector_sum_row, 0, 0, vector_sum_row->dimension(0), 0); // NOLINT
- window_changed = window_changed || update_window_and_padding(win, vector_sum_row_access);
- }
-
- if(bias != nullptr)
- {
- AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]);
- window_changed = window_changed || update_window_and_padding(win, bias_access);
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel()
- : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _bias(nullptr)
-{
-}
-
-void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset,
- int32_t b_offset)
-{
- configure(CLKernelLibrary::get().get_compile_context(), mm_result, vector_sum_col, vector_sum_row, bias, k, a_offset, b_offset);
-}
-
-void CLGEMMLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context, ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias,
- int32_t k, int32_t a_offset,
- int32_t b_offset)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
- vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
- vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
- bias != nullptr ? bias->info() : nullptr,
- a_offset, b_offset)); // NOLINT
-
- _vector_sum_col = vector_sum_col;
- _vector_sum_row = vector_sum_row;
- _mm_result = mm_result;
- _bias = bias;
-
- // Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = vector_sum_row != nullptr
- && mm_result->info()->num_dimensions() > 1
- && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
-
- // Set the arguments to pass at compile time
- CLBuildOptions build_opts;
-
- // If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
- {
- build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
- build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
- }
- // If b_offset == 0, vector_sum_row can be a nullptr
- build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
- build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
- build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1)));
- build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2)));
- build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-
- std::string kernel_name("gemmlowp_offset_contribution");
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(mm_result->info(),
- vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
- vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
- bias != nullptr ? bias->info() : nullptr,
- a_offset, b_offset); // NOLINT
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name + "_";
- _config_id += support::cpp11::to_string(mm_result->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(mm_result->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(mm_result->info()->dimension(2));
-}
-
-Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
- int32_t a_offset, int32_t b_offset)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(),
- vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
- vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
- bias != nullptr ? bias->clone().get() : nullptr,
- a_offset, b_offset)
- .first); // NOLINT
-
- return Status{};
-}
-
-void CLGEMMLowpOffsetContributionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- // Set window for vector_sum_col
- Window win_vector_sum_col = slice;
- win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- // Set window for vector_sum_row
- Window win_vector_sum_row = slice;
- win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Window biases_slice = slice;
- biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
- biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _mm_result, slice);
- add_2D_tensor_argument_if((_vector_sum_col != nullptr), idx, _vector_sum_col, win_vector_sum_col);
- add_2D_tensor_argument_if((_vector_sum_row != nullptr), idx, _vector_sum_row, win_vector_sum_row);
- add_1D_tensor_argument_if((_bias != nullptr), idx, _bias, biases_slice);
-
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
deleted file mode 100644
index 26b318b0fd..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
+++ /dev/null
@@ -1,329 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output,
- int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
-
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
- if(output_stage.is_quantized_per_channel)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_shifts->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_multipliers->dimension(0));
- }
-
- // If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
- }
-
- // If b_offset == 0, vector_sum_row can be a nullptr
- if(b_offset != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
-
- // Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
-
- // Validate input
- ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
- ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
-
- TensorShape output_shape = mm_result->tensor_shape();
- if(output_shape.num_dimensions() > 1)
- {
- const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
-
- TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
- vector_sum_row_shape.collapse_from(1);
- output_shape.collapse_from(output_batch_idx);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
- "mm_result tensor must have the same number of batches of output tensor");
-
- if(a_offset != 0)
- {
- TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
- vector_sum_col_shape.collapse_from(1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
- "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
- }
- }
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE);
- // Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
- {
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != output->data_type());
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output);
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(), "per channel quantization info is incorrect");
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, ITensorInfo *bias, ITensorInfo *output,
- int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, ITensorInfo *output_multipliers, ITensorInfo *output_shifts)
-{
- constexpr unsigned int num_elems_processed_per_iteration = 4;
- bool window_changed = false;
-
- // Auto initialize the output
- auto_init_if_empty(*output, mm_result->clone()->set_data_type(output_stage.output_data_type));
-
- // Configure kernel window
- Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal mm_result_access(mm_result, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, mm_result_access);
-
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, output_access);
-
- if(a_offset != 0)
- {
- AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, vector_sum_col_access);
- }
- if(b_offset != 0)
- {
- AccessWindowStatic vector_sum_row_access(vector_sum_row, 0, 0, vector_sum_row->dimension(0), 0); // NOLINT
- window_changed = window_changed || update_window_and_padding(win, vector_sum_row_access);
- }
-
- if(bias != nullptr)
- {
- AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]);
- window_changed = window_changed || update_window_and_padding(win, bias_access);
- }
-
- if(output_multipliers->dimension(0) > 1)
- {
- AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, output_multipliers_access, output_shifts_access);
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLGEMMLowpOffsetContributionOutputStageKernel::CLGEMMLowpOffsetContributionOutputStageKernel()
- : _mm_result(nullptr),
- _vector_sum_col(nullptr),
- _vector_sum_row(nullptr),
- _bias(nullptr),
- _output(nullptr),
- _output_multipliers(nullptr),
- _output_shifts(nullptr),
- _is_quantized_per_channel(false)
-{
-}
-
-void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output,
- int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
- configure(CLKernelLibrary::get().get_compile_context(), mm_result, vector_sum_col, vector_sum_row, bias, output, k, a_offset, b_offset, output_stage, output_multipliers, output_shifts);
-}
-
-void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext &compile_context, const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row,
- const ICLTensor *bias, ICLTensor *output,
- int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output, output_multipliers, output_shifts);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
- vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
- vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
- bias != nullptr ? bias->info() : nullptr,
- output->info(),
- a_offset, b_offset, output_stage,
- output_multipliers->info(), output_shifts->info())); // NOLINT
-
- const int min = output_stage.gemmlowp_min_bound;
- const int max = output_stage.gemmlowp_max_bound;
-
- _vector_sum_col = vector_sum_col;
- _vector_sum_row = vector_sum_row;
- _mm_result = mm_result;
- _bias = bias;
- _output = output;
- _output_multipliers = output_multipliers;
- _output_shifts = output_shifts;
- _is_quantized_per_channel = output_stage.is_quantized_per_channel;
-
- // Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = vector_sum_row != nullptr
- && mm_result->info()->num_dimensions() > 1
- && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
-
- // Set the arguments to pass at compile time
- CLBuildOptions build_opts;
-
- // If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
- {
- build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
- build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
- }
- // If b_offset == 0, vector_sum_row can be a nullptr
- build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
- build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
- build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1)));
- build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2)));
- build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
- build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
- build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
- build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0]));
- build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
- build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
-
- PixelValue min_val{};
- PixelValue max_val{};
- std::tie(min_val, max_val) = get_min_max(output->info()->data_type());
- build_opts.add_option_if((min > min_val.get<int32_t>()), "-DMIN_BOUND=" + support::cpp11::to_string(min));
- build_opts.add_option_if((max < max_val.get<int32_t>()), "-DMAX_BOUND=" + support::cpp11::to_string(max));
-
- std::string kernel_name("gemmlowp_offset_contribution");
- kernel_name += "_" + string_from_gemmlowp_output_stage(output_stage.type);
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(mm_result->info(),
- vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
- vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
- bias != nullptr ? bias->info() : nullptr,
- output->info(),
- a_offset, b_offset, output_stage,
- output_multipliers->info(), output_shifts->info()); // NOLINT
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name + "_";
- _config_id += support::cpp11::to_string(mm_result->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(mm_result->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(mm_result->info()->dimension(2));
-}
-
-Status CLGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
- const ITensorInfo *output, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage, output_multipliers, output_shifts));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(),
- vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
- vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
- bias != nullptr ? bias->clone().get() : nullptr,
- output->clone().get(),
- a_offset, b_offset, output_stage,
- output_multipliers->clone().get(), output_shifts->clone().get())
- .first); // NOLINT
-
- return Status{};
-}
-
-void CLGEMMLowpOffsetContributionOutputStageKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- // Set window for vector_sum_col
- Window win_vector_sum_col = slice;
- win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- // Set window for vector_sum_row
- Window win_vector_sum_row = slice;
- win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Window biases_slice = slice;
- biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
- biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _mm_result, slice);
- add_2D_tensor_argument_if((_vector_sum_col != nullptr), idx, _vector_sum_col, win_vector_sum_col);
- add_2D_tensor_argument_if((_vector_sum_row != nullptr), idx, _vector_sum_row, win_vector_sum_row);
- add_1D_tensor_argument_if((_bias != nullptr), idx, _bias, biases_slice);
- add_3D_tensor_argument(idx, _output, slice);
- add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_multipliers, biases_slice);
- add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_shifts, biases_slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
deleted file mode 100644
index f9f4839688..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
- const GEMMLowpOutputStageInfo *info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) && (info->output_data_type != DataType::QASYMM8_SIGNED));
- ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)));
- ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))
- || info->gemmlowp_min_bound > info->gemmlowp_max_bound);
-
- // Check biases if exist
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != info->output_data_type, "Mismatching output data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, DataType output_data_type)
-{
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_data_type(output_data_type));
-
- constexpr unsigned int num_elems_processed_per_iteration = 4;
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8));
-
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win,
- input_access);
-
- AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, output_result_access);
- output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
- if(bias != nullptr)
- {
- AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]);
- window_changed = window_changed || update_window_and_padding(win, bias_access);
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-class Coordinates;
-CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel()
- : _input(nullptr), _bias(nullptr), _output(nullptr)
-{
-}
-
-Status CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
- const GEMMLowpOutputStageInfo *info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, info));
-
- return Status{};
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
- const GEMMLowpOutputStageInfo *info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info);
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
- const GEMMLowpOutputStageInfo *info)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), info));
-
- _input = input;
- _bias = bias;
- _output = output;
-
- auto min = info->gemmlowp_min_bound;
- auto max = info->gemmlowp_max_bound;
-
- // Set the arguments to pass at compile time
- CLBuildOptions build_opts;
- build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(info->gemmlowp_real_multiplier));
- build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(info->gemmlowp_offset));
- build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
- build_opts.add_option_if((min > 0), "-DMIN_BOUND=" + support::cpp11::to_string(min));
- build_opts.add_option_if((max < 255), "-DMAX_BOUND=" + support::cpp11::to_string(max));
- build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-
- // Create kernel
- _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down_float", build_opts.options());
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), info->output_data_type);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- // Create input window
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- // Setup bias slice
- unsigned int idx1 = num_arguments_per_3D_tensor();
- if(_bias != nullptr)
- {
- Window biases_slice(slice);
- biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
- biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
- add_1D_tensor_argument(idx1, _bias, biases_slice);
- }
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx1, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
deleted file mode 100644
index 2db7d6d22b..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) && (output_stage->output_data_type != DataType::QASYMM8_SIGNED));
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)));
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))
- || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound);
-
- // Check biases if exist
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != output_stage->output_data_type, "Mismatching output data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, DataType output_data_type)
-{
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_data_type(output_data_type));
-
- constexpr unsigned int num_elems_processed_per_iteration = 4;
-
- // Configure kernel window
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win,
- input_access);
-
- AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, output_result_access);
- output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
- if(bias != nullptr)
- {
- AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]);
- window_changed = window_changed || update_window_and_padding(win, bias_access);
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} //namespace
-
-CLGEMMLowpQuantizeDownInt32ScaleKernel::CLGEMMLowpQuantizeDownInt32ScaleKernel()
- : _input(nullptr), _bias(nullptr), _output(nullptr), _output_stage(nullptr)
-{
-}
-Status CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, output_stage));
-
- return Status{};
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, output_stage);
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
- (bias != nullptr) ? bias->info() : nullptr,
- output->info(),
- output_stage));
-
- _input = input;
- _bias = bias;
- _output = output;
- _output_stage = output_stage;
-
- // Set the arguments to pass at compile time
- auto min = output_stage->gemmlowp_min_bound;
- auto max = output_stage->gemmlowp_max_bound;
- CLBuildOptions build_opts;
- build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage->gemmlowp_offset));
- build_opts.add_option("-DRESULT_MULT_INT=" + support::cpp11::to_string(output_stage->gemmlowp_multiplier));
- build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage->gemmlowp_shift));
- build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max),
- "-DMIN_BOUND=" + support::cpp11::to_string(min));
- build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max),
- "-DMAX_BOUND=" + support::cpp11::to_string(max));
- build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
- build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-
- // Create kernel
- _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down", build_opts.options());
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), output_stage->output_data_type);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- unsigned int idx1 = num_arguments_per_3D_tensor();
- if(_bias != nullptr)
- {
- Window biases_slice(slice);
- biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
- biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
- add_1D_tensor_argument(idx1, _bias, biases_slice);
- }
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx1, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} \ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
deleted file mode 100644
index 2306b009bd..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
- int min, int max)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(min > max);
-
- // Check biases if exist
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QSYMM16);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, input);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
-{
- constexpr unsigned int num_elems_processed_per_iteration = 4;
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QSYMM16));
-
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win, input_access);
-
- if(output->total_size() != 0)
- {
- Window win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win_out, output_result_access);
-
- output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- }
-
- if(bias != nullptr)
- {
- AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]);
- window_changed = window_changed || update_window_and_padding(win, bias_access);
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel()
- : _input(nullptr), _bias(nullptr), _output(nullptr)
-{
-}
-
-Status CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
- int min, int max)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
- (bias != nullptr) ? bias->clone().get() : nullptr,
- output->clone().get())
- .first);
-
- return Status{};
-}
-
-void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
- int result_fixedpoint_multiplier, int result_shift,
- int min, int max)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, min, max);
-}
-
-void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
- int result_fixedpoint_multiplier, int result_shift,
- int min, int max)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(),
- min, max));
-
- _input = input;
- _bias = bias;
- _output = output;
-
- // Set the arguments to pass at compile time
- CLBuildOptions build_opts;
- build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(result_fixedpoint_multiplier));
- build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(result_shift));
- build_opts.add_option_if((min > -32768), "-DMIN_BOUND=" + support::cpp11::to_string(min));
- build_opts.add_option_if((max < 32767), "-DMAX_BOUND=" + support::cpp11::to_string(max));
- build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-
- // Create kernel
- _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", build_opts.options());
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-}
-
-void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- // Create input window
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- // Setup bias slice
- unsigned int idx1 = num_arguments_per_3D_tensor();
- if(_bias != nullptr)
- {
- Window biases_slice(slice);
- biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
- biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
- add_1D_tensor_argument(idx1, _bias, biases_slice);
- }
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx1, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
deleted file mode 100644
index b4a7cc9d90..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
- int min, int max)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(min > max);
-
- // Check biases if exist
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
-{
- constexpr unsigned int num_elems_processed_per_iteration = 4;
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8_SIGNED));
-
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win, input_access);
-
- if(output->total_size() != 0)
- {
- Window win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win_out, output_result_access);
- output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- }
-
- if(bias != nullptr)
- {
- AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]);
- window_changed = window_changed || update_window_and_padding(win, bias_access);
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel()
- : _input(nullptr), _bias(nullptr), _output(nullptr)
-{
-}
-
-Status CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
- int min, int max)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
- (bias != nullptr) ? bias->clone().get() : nullptr,
- output->clone().get())
- .first);
-
- return Status{};
-}
-
-void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
- int min, int max)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
-}
-
-void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
- int min, int max)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), min, max));
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
- _input = input;
- _bias = bias;
- _output = output;
-
- // Set the arguments to pass at compile time
- CLBuildOptions build_opts;
- build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(result_offset_after_shift));
- build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(result_fixedpoint_multiplier));
- build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(result_shift));
- build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
- build_opts.add_option_if((min > -128), "-DMIN_BOUND=" + support::cpp11::to_string(min));
- build_opts.add_option_if((max < 127), "-DMAX_BOUND=" + support::cpp11::to_string(max));
- build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-
- // Create kernel
- _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down_fixedpoint", build_opts.options());
-
- ICLKernel::configure_internal(win_config.second);
-}
-
-void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- // Create input window
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- // Setup bias slice
- unsigned int idx1 = num_arguments_per_3D_tensor();
- if(_bias != nullptr)
- {
- Window biases_slice(slice);
- biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
- biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
- add_1D_tensor_argument(idx1, _bias, biases_slice);
- }
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx1, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
deleted file mode 100644
index 3158d59948..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
- int min, int max)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(min > max);
-
- // Check biases if exist
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
-{
- constexpr unsigned int num_elems_processed_per_iteration = 4;
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8));
-
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win, input_access);
-
- if(output->total_size() != 0)
- {
- Window win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win_out, output_result_access);
- output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- }
-
- if(bias != nullptr)
- {
- AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]);
- window_changed = window_changed || update_window_and_padding(win, bias_access);
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel()
- : _input(nullptr), _bias(nullptr), _output(nullptr)
-{
-}
-
-Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
- int min, int max)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
- (bias != nullptr) ? bias->clone().get() : nullptr,
- output->clone().get())
- .first);
-
- return Status{};
-}
-
-void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
- int min, int max)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
-}
-
-void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
- int min, int max)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), min, max));
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
- _input = input;
- _bias = bias;
- _output = output;
-
- // Set the arguments to pass at compile time
- CLBuildOptions build_opts;
- build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(result_offset_after_shift));
- build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(result_fixedpoint_multiplier));
- build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(result_shift));
- build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
- build_opts.add_option_if((min > 0), "-DMIN_BOUND=" + support::cpp11::to_string(min));
- build_opts.add_option_if((max < 255), "-DMAX_BOUND=" + support::cpp11::to_string(max));
- build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-
- // Create kernel
- _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down_fixedpoint", build_opts.options());
-
- ICLKernel::configure_internal(win_config.second);
-}
-
-void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- // Create input window
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- // Setup bias slice
- unsigned int idx1 = num_arguments_per_3D_tensor();
- if(_bias != nullptr)
- {
- Window biases_slice(slice);
- biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
- biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
- add_1D_tensor_argument(idx1, _bias, biases_slice);
- }
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx1, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
deleted file mode 100644
index 44f8797cff..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
+++ /dev/null
@@ -1,227 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/KernelDescriptors.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-/** Validate the tensor infos used by the matrix A (row-sum) reduction.
- *
- * @param[in] input  Info of the quantized 8-bit input matrix.
- * @param[in] output Info of the row-sum vector; only checked once it has been initialised.
- *
- * @return An empty Status on success, otherwise the first error encountered.
- */
-Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8);
-
-    // An output that has not been initialised yet has nothing further to check
-    if(output->total_size() == 0)
-    {
-        return Status{};
-    }
-
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(1), "Output vector must have length equal to the number of rows of the input matrix");
-    return Status{};
-}
-
-/** Validate the tensor infos used by the matrix B (column-sum) reduction.
- *
- * @param[in] input  Info of the quantized 8-bit input matrix.
- * @param[in] output Info of the column-sum vector; only checked once it has been initialised.
- *
- * @return An empty Status on success, otherwise the first error encountered.
- */
-Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8);
-
-    // An output that has not been initialised yet has nothing further to check
-    if(output->total_size() == 0)
-    {
-        return Status{};
-    }
-
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0), "Output vector must have length equal to the number of columns of the input matrix");
-    return Status{};
-}
-
-/** Build the execution window for the matrix B reduction and request the padding it needs.
- *
- * @param[in,out] input  Info of matrix B; its padding may be extended.
- * @param[in,out] output Info of the column-sum vector; auto-initialised to a 1D S32 tensor when empty.
- *
- * @return A pair holding an error Status if the window had to be changed, and the computed window.
- */
-std::pair<Status, Window> validate_and_configure_window_matrix_b_reduction(ITensorInfo *input, ITensorInfo *output)
-{
-    constexpr unsigned int vector_width = 16;
-
-    // Give an empty output one S32 element per column of the input
-    auto_init_if_empty(*output, TensorShape(input->dimension(0)), 1, DataType::S32);
-
-    // The window spans the output in steps of one vector
-    Window window = calculate_max_window(*output, Steps(vector_width));
-
-    AccessWindowStatic     src_access(input, 0, 0, ceil_to_multiple(input->dimension(0), vector_width), input->dimension(1));
-    AccessWindowHorizontal dst_access(output, 0, vector_width);
-
-    const bool padding_changed = update_window_and_padding(window, src_access, dst_access);
-    dst_access.set_valid_region(window, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
-
-    if(padding_changed)
-    {
-        return std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!"), window);
-    }
-    return std::make_pair(Status{}, window);
-}
-} // namespace
-
-ICLGEMMLowpReductionKernel::ICLGEMMLowpReductionKernel()
- : _input(), _output()
-{
-}
-
-/** Configure the kernel using the compile context owned by the CLKernelLibrary singleton.
- *
- * Forwards every argument unchanged to the compile-context overload.
- */
-void CLGEMMLowpMatrixAReductionKernel::configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
-{
-    const auto &default_context = CLKernelLibrary::get().get_compile_context();
-    configure(default_context, mtx_a, vector_sum_row, info);
-}
-
-/** Configure the matrix A reduction kernel: validate, build the OpenCL program and set the window.
- *
- * @param[in]  compile_context Compile context used to create the kernel.
- * @param[in]  mtx_a           Input matrix A.
- * @param[out] vector_sum_row  Output vector; auto-initialised to a 1D S32 tensor when empty.
- * @param[in]  info            Reduction options (optional multiplication by a scalar).
- */
-void CLGEMMLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context, const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
-{
-    // Validate before touching any kernel state
-    ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info()));
-
-    // An empty output becomes a 1D S32 vector with one element per row of A
-    auto_init_if_empty(*vector_sum_row->info(), TensorShape(mtx_a->info()->dimension(1)), 1, DataType::S32);
-
-    _input  = mtx_a;
-    _output = vector_sum_row;
-
-    const auto    *lhs_info = mtx_a->info();
-    const DataType lhs_type = lhs_info->data_type();
-
-    // Compile-time definitions passed to the OpenCL kernel
-    CLBuildOptions defines;
-    defines.add_option("-DCOLS_A=" + support::cpp11::to_string(lhs_info->dimension(0)));
-    defines.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(lhs_type));
-    defines.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(lhs_type));
-    defines.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar));
-
-    // Select the "_dot8" kernel variant when the device reports dot8 support
-    std::string kernel_name = "gemmlowp_matrix_a_reduction";
-    if(dot8_supported(CLKernelLibrary::get().get_device()))
-    {
-        kernel_name += "_dot8";
-    }
-
-    _kernel = create_kernel(compile_context, kernel_name, defines.options());
-
-    // This kernel does not need padding, so the window is the maximum window over the output
-    Window max_window = calculate_max_window(*vector_sum_row->info(), Steps());
-    ICLKernel::configure_internal(max_window);
-
-    // Config id: "<kernel name>_<dim0>_<dim1>_<dim2>" of the input tensor
-    _config_id = kernel_name;
-    for(unsigned int dim = 0; dim < 3; ++dim)
-    {
-        _config_id += "_";
-        _config_id += support::cpp11::to_string(_input->info()->dimension(dim));
-    }
-}
-
-/** Static validation of a matrix A reduction configuration.
- *
- * @param[in] mtx_a          Info of the input matrix A.
- * @param[in] vector_sum_row Info of the output vector.
- * @param[in] info           Reduction options; not inspected here.
- *
- * @return The Status produced by the argument validation.
- */
-Status CLGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    const Status arguments_status = validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row);
-    ARM_COMPUTE_RETURN_ON_ERROR(arguments_status);
-
-    return Status{};
-}
-
-/** Enqueue the matrix A reduction once per 2D slice of the (possibly collapsed) window.
- *
- * @param[in]     window Region to process; must be a valid sub-window of the configured kernel window.
- * @param[in,out] queue  Command queue the kernel is enqueued on.
- */
-void CLGEMMLowpMatrixAReductionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window collapsed  = window.collapse_if_possible(ICLKernel::window(), Window::DimY);
-    Window output_win = collapsed.first_slice_window_2D();
-    Window input_win  = collapsed.first_slice_window_2D();
-
-    // The input slice is zeroed on X, Y and Z: its dimensions are increased in the cl kernel.
-    const Window::Dimension zero_dim(0, 0, 0);
-    input_win.set(Window::DimX, zero_dim);
-    input_win.set(Window::DimY, zero_dim);
-    input_win.set(Window::DimZ, zero_dim);
-
-    bool has_more_slices = true;
-    while(has_more_slices)
-    {
-        unsigned int arg_idx = 0;
-        add_3D_tensor_argument(arg_idx, _input, input_win);
-        add_2D_tensor_argument(arg_idx, _output, output_win);
-        enqueue(queue, *this, output_win, lws_hint());
-
-        // Only the output slice slides; the input slice stays fixed
-        has_more_slices = collapsed.slide_window_slice_2D(output_win);
-    }
-}
-
-/** Configure the kernel using the compile context owned by the CLKernelLibrary singleton.
- *
- * Forwards every argument unchanged to the compile-context overload.
- */
-void CLGEMMLowpMatrixBReductionKernel::configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
-{
-    const auto &default_context = CLKernelLibrary::get().get_compile_context();
-    configure(default_context, mtx_b, vector_sum_col, info);
-}
-
-/** Configure the matrix B reduction kernel: validate, build the OpenCL program and set the window.
- *
- * @param[in]  compile_context Compile context used to create the kernel.
- * @param[in]  mtx_b           Input matrix B.
- * @param[out] vector_sum_col  Output vector; auto-initialised during window configuration when empty.
- * @param[in]  info            Reduction options (optional multiplication by a scalar).
- */
-void CLGEMMLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context, const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
-{
-    // Reject null tensors and incompatible tensor metadata up front
-    ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info()));
-
-    _input  = mtx_b;
-    _output = vector_sum_col;
-
-    const auto    *rhs_info = mtx_b->info();
-    const DataType rhs_type = rhs_info->data_type();
-
-    // Compile-time definitions passed to the OpenCL kernel
-    CLBuildOptions defines;
-    defines.add_option("-DCOLS_B=" + support::cpp11::to_string(rhs_info->dimension(0)));
-    defines.add_option("-DROWS_B=" + support::cpp11::to_string(rhs_info->dimension(1)));
-    defines.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(rhs_type));
-    defines.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(rhs_type));
-    defines.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar));
-
-    _kernel = create_kernel(compile_context, "gemmlowp_matrix_b_reduction", defines.options());
-
-    // Window configuration may auto-initialise the output and extend tensor padding
-    auto window_setup = validate_and_configure_window_matrix_b_reduction(_input->info(), _output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(window_setup.first);
-    ICLKernel::configure_internal(window_setup.second);
-}
-
-/** Static validation of a matrix B reduction configuration.
- *
- * @param[in] mtx_b          Info of the input matrix B.
- * @param[in] vector_sum_col Info of the output vector.
- * @param[in] info           Reduction options; not inspected here.
- *
- * @return An empty Status on success, otherwise the first error encountered.
- */
-Status CLGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-
-    const Status arguments_status = validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col);
-    ARM_COMPUTE_RETURN_ON_ERROR(arguments_status);
-
-    // The window check runs on clones so the caller's tensor infos are left untouched
-    const Status window_status = validate_and_configure_window_matrix_b_reduction(mtx_b->clone().get(), vector_sum_col->clone().get()).first;
-    ARM_COMPUTE_RETURN_ON_ERROR(window_status);
-
-    return Status{};
-}
-
-/** Enqueue the matrix B reduction once per 2D slice of the (possibly collapsed) window.
- *
- * @param[in]     window Region to process; must be a valid sub-window of the configured kernel window.
- * @param[in,out] queue  Command queue the kernel is enqueued on.
- */
-void CLGEMMLowpMatrixBReductionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY);
-
-    Window output_win = collapsed.first_slice_window_2D();
-
-    // The input slice starts as a copy of the output slice with Y and Z zeroed
-    Window                  input_win = output_win;
-    const Window::Dimension zero_dim(0, 0, 0);
-    input_win.set(Window::DimY, zero_dim);
-    input_win.set(Window::DimZ, zero_dim);
-
-    bool has_more_slices = true;
-    while(has_more_slices)
-    {
-        unsigned int arg_idx = 0;
-        add_3D_tensor_argument(arg_idx, _input, input_win);
-        add_2D_tensor_argument(arg_idx, _output, output_win);
-        enqueue(queue, *this, output_win, lws_hint());
-
-        // Only the output slice slides; the input slice stays fixed
-        has_more_slices = collapsed.slide_window_slice_2D(output_win);
-    }
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
deleted file mode 100644
index 03cd1878aa..0000000000
--- a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-namespace
-{
-/** Check that the accumulator and bias tensor infos can be used by the bias-accumulation kernel.
- *
- * @param[in] accum  Info of the accumulator tensor; must be F16 or F32.
- * @param[in] biases Info of the bias tensor; must match the accumulator data type and be one-dimensional.
- *
- * @return An empty Status on success, otherwise the first error encountered.
- */
-Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases)
-{
-    // Explicit null check: validate() reaches this function without the null check configure() performs first,
-    // and biases is dereferenced directly below.
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(accum, biases);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(accum);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
-    ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() != 1);
-
-    return Status{};
-}
-
-/** Build the execution window for bias accumulation and request the padding it needs.
- *
- * @param[in,out] accum                             Info of the accumulator tensor; its padding may be extended.
- * @param[in,out] biases                            Info of the bias tensor; its padding may be extended.
- * @param[in]     gpu_target                        GPU target used to pick the vector size.
- * @param[out]    num_elems_processed_per_iteration Vector size selected for the target.
- *
- * @return A pair holding an error Status if the window had to be changed, and the computed window.
- */
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *accum, ITensorInfo *biases, GPUTarget gpu_target,
-                                                        unsigned int &num_elems_processed_per_iteration)
-{
-    // Vector size: 8 for the Bifrost targets listed below, 16 for any other target
-    const bool on_bifrost = gpu_target_is_in(gpu_target,
-                                             GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
-                                             GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
-                                             GPUTarget::G52, GPUTarget::G52LIT);
-    if(on_bifrost)
-    {
-        num_elems_processed_per_iteration = 8;
-    }
-    else
-    {
-        num_elems_processed_per_iteration = 16;
-    }
-
-    // The window spans the accumulator in steps of one vector
-    Window window = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration));
-
-    AccessWindowStatic     bias_access(biases, 0, 0, ceil_to_multiple(biases->dimension(0), num_elems_processed_per_iteration), biases->dimension(1));
-    AccessWindowHorizontal accumulator_access(accum, 0, num_elems_processed_per_iteration);
-
-    const bool padding_changed = update_window_and_padding(window, bias_access, accumulator_access);
-
-    if(padding_changed)
-    {
-        return std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!"), window);
-    }
-    return std::make_pair(Status{}, window);
-}
-} // namespace
-
-CLGEMMMatrixAccumulateBiasesKernel::CLGEMMMatrixAccumulateBiasesKernel()
- : _accum(nullptr), _biases(nullptr)
-{
-}
-
-/** Configure the kernel using the compile context owned by the CLKernelLibrary singleton.
- *
- * Forwards every argument unchanged to the compile-context overload.
- */
-void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTensor *biases)
-{
-    const auto &default_context = CLKernelLibrary::get().get_compile_context();
-    configure(default_context, accum, biases);
-}
-
-/** Configure the bias-accumulation kernel: validate, set the window and build the OpenCL program.
- *
- * @param[in]     compile_context Compile context used to create the kernel.
- * @param[in,out] accum           Accumulator tensor.
- * @param[in]     biases          Bias tensor.
- */
-void CLGEMMMatrixAccumulateBiasesKernel::configure(const CLCompileContext &compile_context, ICLTensor *accum, const ICLTensor *biases)
-{
-    // Reject null tensors and incompatible tensor metadata up front
-    ARM_COMPUTE_ERROR_ON_NULLPTR(accum, biases);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(accum->info(), biases->info()));
-
-    _biases = biases;
-    _accum  = accum;
-
-    // The window helper picks the vector size for the current target and reports it back through elems_per_iteration
-    const GPUTarget target              = get_target();
-    unsigned int    elems_per_iteration = 0;
-
-    auto window_setup = validate_and_configure_window(accum->info(), biases->info(), target, elems_per_iteration);
-    ARM_COMPUTE_ERROR_THROW_ON(window_setup.first);
-    ICLKernel::configure_internal(window_setup.second);
-
-    // Compile-time definitions passed to the OpenCL kernel
-    CLBuildOptions defines;
-    defines.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(accum->info()->data_type()));
-    defines.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(elems_per_iteration));
-
-    _kernel = create_kernel(compile_context, "gemm_accumulate_biases", defines.options());
-}
-
-/** Static validation of a bias-accumulation configuration.
- *
- * @param[in] accum      Info of the accumulator tensor.
- * @param[in] biases     Info of the bias tensor.
- * @param[in] gpu_target GPU target used to pick the vector size.
- *
- * @return An empty Status on success, otherwise the first error encountered.
- */
-Status CLGEMMMatrixAccumulateBiasesKernel::validate(const ITensorInfo *accum, const ITensorInfo *biases, GPUTarget gpu_target)
-{
-    const Status arguments_status = validate_arguments(accum, biases);
-    ARM_COMPUTE_RETURN_ON_ERROR(arguments_status);
-
-    // The window check runs on clones so the caller's tensor infos are left untouched;
-    // the vector size it reports back is not needed here.
-    unsigned int   ignored_vector_size = 0;
-    const Status   window_status       = validate_and_configure_window(accum->clone().get(), biases->clone().get(), gpu_target, ignored_vector_size).first;
-    ARM_COMPUTE_RETURN_ON_ERROR(window_status);
-
-    return Status{};
-}
-
-/** Enqueue the bias accumulation once per 2D slice of the window.
- *
- * @param[in]     window Region to process; must match the configured kernel window.
- * @param[in,out] queue  Command queue the kernel is enqueued on.
- */
-void CLGEMMMatrixAccumulateBiasesKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-    Window slice_2d = window.first_slice_window_2D();
-
-    // The bias window is a copy of the first slice with its Y dimension replaced by (0, 1, 1)
-    Window bias_window(slice_2d);
-    bias_window.set(Window::DimY, Window::Dimension(0, 1, 1));
-
-    bool has_more_slices = true;
-    while(has_more_slices)
-    {
-        unsigned int arg_idx = 0;
-        add_2D_tensor_argument(arg_idx, _accum, slice_2d);
-        add_1D_tensor_argument(arg_idx, _biases, bias_window);
-
-        enqueue(queue, *this, slice_2d, lws_hint());
-        has_more_slices = window.slide_window_slice_2D(slice_2d);
-    }
-}
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
deleted file mode 100644
index d2c79543ad..0000000000
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ /dev/null
@@ -1,548 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/helpers/float_ops.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-using ElementsProcessed = Steps;
-
-inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float beta,
- bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision)
-{ // Validates data types, ranks and shapes of the GEMM operands; returns the first failing check as an error Status, or an empty Status.
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output); // input2 (the bias) is optional and may be nullptr
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((fp_mixed_precision && (input0->data_type() != DataType::F16)), "Mixed precision floating point is supported only for F16 data");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 2 && reshape_info.reinterpret_input_as_3d(), "The input1 tensor cannot have more than 2 dimensions if input0 has to be reinterpreted as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((reshape_info.reinterpret_input_as_3d() || reshape_info.depth_output_gemm3d() != 0) && (input2 != nullptr)
- && (!reshape_info.broadcast_bias()),
- "Bias addition only supported with broadcast mode in case the input or output has to be reinterpreted as 3D");
- // Inputs used as-is: dimension 0 of input0 must equal dimension 1 of input1
- if(!is_interleaved_transposed)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
- // The bias is only checked when it is provided and beta is non-zero
- if(input2 != nullptr && !(helpers::float_ops::is_zero(beta)))
- {
- const unsigned int m = reshape_info.reinterpret_input_as_3d() ? input0->dimension(1) * input0->dimension(2) : input0->dimension(1);
- const unsigned int n = input1->dimension(0);
- const unsigned int input2_dim0 = input2->dimension(0);
- const unsigned int input2_dim1 = input2->dimension(1);
- // A broadcast bias must be n x 1; otherwise it must be n x m
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input2, input1);
- if(reshape_info.broadcast_bias())
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim1 != 1 || input2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim0 != n || input2_dim1 != m), "Incorrect dimension of bias matrix");
- }
- }
- }
- else
- { // Reshaped inputs: rebuild the expected reshaped shapes from reshape_info and compare them with the actual inputs
- GEMMRHSMatrixInfo rhs_info;
- GEMMLHSMatrixInfo lhs_info;
- const auto m = static_cast<unsigned int>(reshape_info.m());
- const auto n = static_cast<unsigned int>(reshape_info.n());
- const int k = reshape_info.k();
- const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
- const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
- rhs_info.n0 = 16 / input1->element_size();
- rhs_info.k0 = 1;
- rhs_info.h0 = mult_transpose1xW_width;
- rhs_info.interleave = false;
- rhs_info.transpose = false;
- lhs_info.m0 = 4;
- lhs_info.k0 = 4;
- lhs_info.v0 = mult_interleave4x4_height;
- lhs_info.interleave = true;
- lhs_info.transpose = true;
- // Shape of input0 before reshaping: dimension 0 is k, dimension 1 is m
- TensorShape tensor_shape0{ input0->tensor_shape() };
- tensor_shape0.set(0, k);
- tensor_shape0.set(1, m);
- // Shape of input1 before reshaping: dimension 0 is n, dimension 1 is k
- TensorShape tensor_shape1{ input1->tensor_shape() };
- tensor_shape1.set(0, n);
- tensor_shape1.set(1, k);
-
- const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
- const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
- // Apply the LHS/RHS reshape descriptors configured above to obtain the expected reshaped shapes
- const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
- const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
- // The bias is only checked when it is provided and beta is non-zero
- if(input2 != nullptr && !(helpers::float_ops::is_zero(beta)))
- {
- const unsigned int input2_dim0 = input2->dimension(0);
- const unsigned int input2_dim1 = input2->dimension(1);
- // A broadcast bias must be n x 1; otherwise it must be n x m
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input2, input1);
- if(reshape_info.broadcast_bias())
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim1 != 1 || input2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim0 != n || input2_dim1 != m), "Incorrect dimension of bias matrix");
- }
- }
- }
- // An already-initialised output must match the computed matrix-multiply shape and the data type of input0
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
- }
-
- return Status{};
-}
-
-inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output,
- float beta, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target,
- ElementsProcessed &num_elements_processed)
-{ // Computes the execution window, writes the X/Y step sizes into num_elements_processed and requests tensor padding; returns {status, collapsed window}.
- ARM_COMPUTE_UNUSED(beta);
- bool window_changed = false;
- Window win{};
- Window win_out{};
- // The two references below alias entries 0 and 1 of num_elements_processed, so assigning to them reports the step sizes to the caller
- const DataType data_type = input0->data_type();
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d();
- bool reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(reinterpret_input_as_3d == reinterpret_output_as_3d) // also true when both are already false, in which case the assignments are no-ops
- {
- reinterpret_input_as_3d = false;
- reinterpret_output_as_3d = false;
- }
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info)));
- // tmp_info is a copy of the output info; win is computed from it, while win_out is computed from the real output
- TensorInfo tmp_info(*output);
-
- if(reinterpret_output_as_3d)
- {
- // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(output->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- if(is_interleaved_transposed)
- {
- // reinterpret_input_as_3d is not supported if is_interleaved_transposed is set
- ARM_COMPUTE_ERROR_ON(reshape_info.reinterpret_input_as_3d());
-
- // Configure kernel window
- num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
- num_elems_processed_per_iteration_y = 4;
-
- // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor
- // The only way to set the paddings properly is to set them explicitly through the AccessWindowStatic
- const int m = reshape_info.m();
- const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y; // rows needed to round m up to a multiple of the Y step (0 if already a multiple)
-
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), input0->dimension(1));
- AccessWindowStatic input1_access(input1, 0, 0,
- ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
- ceil_to_multiple(input1->dimension(1), num_elems_processed_per_iteration_y));
- AccessWindowStatic output_access(output, 0, 0,
- ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
- output->dimension(1) + bottom_pad);
-
- if(input2 != nullptr)
- {
- const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
- // A broadcast bias is accessed one row at a time
- const int bias_processed_per_iteration_y = reshape_info.broadcast_bias() ? 1 : num_elems_processed_per_iteration_y;
-
- AccessWindowStatic input2_access(input2, 0, 0,
- ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),
- ceil_to_multiple(input2->dimension(1), bias_processed_per_iteration_y));
- // NOTE: '||' short-circuits, so the output update is skipped when the first call already returns true
- window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
- }
- else
- {
- window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
- }
-
- output_access.set_valid_region(win_out, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
- }
- else // The input tensors have not been reshaped
- {
- // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x is set up for the default case.
- num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
- num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
-
- // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor
- // The only way to set the paddings properly is to set them explicitly through the AccessWindowStatic
- const int m = reinterpret_input_as_3d ? input0->tensor_shape()[1] * input0->tensor_shape()[2] : input0->tensor_shape()[1];
- const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y; // rows needed to round m up to a multiple of the Y step (0 if already a multiple)
-
- // Create kernels according to the architecture, data type and input size.
- GPUTarget arch_target = get_arch_from_target(gpu_target);
- if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)
- {
- num_elems_processed_per_iteration_x = (input1->dimension(0) <= 1000 && input0->num_dimensions() == 1) ? 2 : 4;
- }
-
- // Configure window
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), input0->dimension(1) + bottom_pad);
- AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));
- AccessWindowStatic output_access(output, 0, 0,
- ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
- output->dimension(1) + bottom_pad);
-
- if(input2 != nullptr)
- {
- const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
- // A broadcast bias is accessed one row at a time
- const int bias_processed_per_iteration_y = reshape_info.broadcast_bias() ? 1 : num_elems_processed_per_iteration_y;
-
- AccessWindowStatic input2_access(input2, 0, 0,
- ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),
- ceil_to_multiple(input2->dimension(1), bias_processed_per_iteration_y));
- // NOTE: '||' short-circuits, so the output update is skipped when the first call already returns true
- window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
- }
- else
- {
- window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
- }
-
- Coordinates coord;
- coord.set_num_dimensions(output->num_dimensions());
- output_access.set_valid_region(win_out, ValidRegion(coord, output->tensor_shape()));
- }
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win; // overwritten by the collapse() result two lines below
- const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
- // A changed window is reported as an "Insufficient Padding!" runtime error
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
- : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _add_bias(false),
- _broadcast_bias(false)
-{
-}
-
-/** Configure the kernel using the compile context owned by the CLKernelLibrary singleton.
- *
- * Forwards every argument unchanged to the compile-context overload.
- */
-void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
-                                           bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision, const ActivationLayerInfo &activation_info)
-{
-    const auto &default_context = CLKernelLibrary::get().get_compile_context();
-    configure(default_context, input0, input1, input2, output, alpha, beta,
-              is_interleaved_transposed, reshape_info, fp_mixed_precision, activation_info);
-}
-
-void CLGEMMMatrixMultiplyKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
- bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision, const ActivationLayerInfo &activation_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
- // Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), (input2 != nullptr) ? input2->info() : nullptr, output->info(), beta,
- is_interleaved_transposed, reshape_info, fp_mixed_precision));
-
- _input0 = input0;
- _input1 = input1;
- _input2 = helpers::float_ops::is_zero(beta) ? nullptr : input2;
- _output = output;
- _reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d();
- _reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
- _add_bias = _input2 != nullptr;
- _broadcast_bias = reshape_info.broadcast_bias();
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
- {
- _reinterpret_input_as_3d = false;
- _reinterpret_output_as_3d = false;
- }
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_input0 = _reinterpret_input_as_3d ? _input0->info()->num_dimensions() - 1 : _input0->info()->num_dimensions();
-
- _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
- const DataType data_type = input0->info()->data_type();
-
- // Get target architecture
- GPUTarget gpu_target = get_target();
-
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), (input2 != nullptr) ? input2->info() : nullptr, output->info(), beta, is_interleaved_transposed, reshape_info,
- gpu_target, num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Create build options
- CLBuildOptions build_opts;
-
- build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
- build_opts.add_option_if(_input2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
- build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
- build_opts.add_option_if(reshape_info.broadcast_bias(), "-DBROADCAST_BIAS");
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
- build_opts.add_option_if(activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(activation_info.activation())));
- build_opts.add_option_if(activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(activation_info.a()));
- build_opts.add_option_if(activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(activation_info.b()));
-
- const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST;
-
- std::string kernel_name;
- if(is_interleaved_transposed)
- {
- const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
- const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
-
- build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0)));
- build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width));
- build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height));
-
- if(is_data_type_float(data_type) && is_bifrost)
- {
- kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
- }
- else
- {
- kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type));
- if(fp_mixed_precision && data_type == DataType::F16)
- {
- // currently wider accumulator is only supported for fp16 kernels.
- kernel_name += "_acc32";
- }
- }
- }
- else // The input tensors have not been reshaped
- {
- build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
-
- // Create kernels according to the architecture, data type and input size.
- if(is_data_type_float(data_type) && is_bifrost)
- {
- kernel_name = "gemm_mm_floating_point";
-
- if(input0->info()->num_dimensions() != 1)
- {
- kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
- if(fp_mixed_precision && data_type == DataType::F16)
- {
- // currently wider accumulator is only supported for fp16 kernels.
- kernel_name += "_acc32";
- }
- }
- else if(input1->info()->dimension(0) <= 1000 && data_type == DataType::F32)
- {
- // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and
- // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.
- // FC6 and FC7 of AlexNet and VGG-16).
- kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost_1000";
- }
-
- // The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels
- // via exhaustive autotuning over a range of representative layer configurations.
- set_lws_hint(cl::NDRange(4));
- }
- else // (MIDGARD and F32) or (F16)
- {
- kernel_name = "gemm_mm_floating_point";
- }
- build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y()));
- build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elements_processed.x()));
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = "gemm_";
- _config_id += (is_interleaved_transposed ? "reshaped_" : "");
- _config_id += (_add_bias ? "add_bias_" : "");
- _config_id += (_broadcast_bias ? "broadcast_bias_" : "");
- _config_id += (fp_mixed_precision ? "fp_mixed_" : "");
- _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(3));
- _config_id += "_";
- _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
-}
-
-Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,
- bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision, const ActivationLayerInfo &activation_info)
-{
- // Note: num_elements_processed will be set in validate_and_configure_window()
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_UNUSED(activation_info);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, input2, output, beta, is_interleaved_transposed, reshape_info, fp_mixed_precision));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
- input1->clone().get(),
- (input2 != nullptr) ? input2->clone().get() : nullptr,
- output->clone().get(),
- beta,
- is_interleaved_transposed,
- reshape_info,
- gpu_target,
- num_elements_processed)
- .first);
-
- return Status{};
-}
-
-void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- if(_input1->info()->num_dimensions() < 3)
- {
- // The stride_z for matrix B must be zero if we do not slice
- ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
- }
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- const unsigned int num_arguments_bias = _add_bias ? num_arguments_per_2D_tensor() + 1 : 0;
-
- if(_reinterpret_input_as_3d)
- {
- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + num_arguments_bias;
- const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- if(_reinterpret_output_as_3d)
- {
- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0) + num_arguments_bias;
- const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- do
- {
- Window slice_b = slice;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input0, slice);
- add_2D_tensor_argument(idx, _input1, slice_b);
- if(_add_bias)
- {
- add_2D_tensor_argument(idx, _input2, slice);
- }
- add_2D_tensor_argument(idx, _output, slice);
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
- if(_add_bias)
- {
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input2->info()->strides_in_bytes()[2]));
- }
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp
deleted file mode 100644
index dce8d81ca8..0000000000
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp
+++ /dev/null
@@ -1,417 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/helpers/float_ops.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info)
-{
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
- ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (input2 != nullptr)
- && (!gemm_info.broadcast_bias),
- "Bias addition only supported with broadcast mode in case the input or output has to be reinterpreted as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for GEMM native");
-
- const unsigned int m = gemm_info.m;
- const unsigned int n = gemm_info.n;
- const unsigned int k = gemm_info.k;
-
- ARM_COMPUTE_UNUSED(m);
- ARM_COMPUTE_UNUSED(n);
- ARM_COMPUTE_UNUSED(k);
-
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != k);
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != n);
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(1) != k);
- if(gemm_info.reinterpret_input_as_3d)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != m);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != m);
- }
-
- if(input2 != nullptr && !(helpers::float_ops::is_zero(beta)))
- {
- const unsigned int input2_dim0 = input2->dimension(0);
- const unsigned int input2_dim1 = input2->dimension(1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input2, input1);
- if(gemm_info.broadcast_bias)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim1 != 1 || input2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim0 != n || input2_dim1 != m), "Incorrect dimension of bias matrix");
- }
- }
-
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
- bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
-
- Window win{};
- Window win_out{};
- bool window_changed = false;
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(reinterpret_input_as_3d == reinterpret_output_as_3d)
- {
- reinterpret_output_as_3d = false;
- }
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)));
-
- TensorInfo tmp_info(*output);
-
- if(reinterpret_output_as_3d)
- {
- // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(output->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- // Configure kernel window
- num_elems_processed_per_iteration_x = rhs_info.n0;
- num_elems_processed_per_iteration_y = lhs_info.m0;
-
- // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor
- // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic
- const unsigned int m = reinterpret_output_as_3d ? gemm_info.m : output->dimension(1);
- const unsigned int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
-
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic input0_access(input0, 0, 0,
- input0->dimension(0),
- input0->dimension(1) + bottom_pad);
- AccessWindowStatic input1_access(input1, 0, 0,
- ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
- input1->dimension(1));
- AccessWindowStatic output_access(output, 0, 0,
- ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
- output->dimension(1) + bottom_pad);
-
- if(input2 != nullptr)
- {
- const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
-
- const int bias_processed_per_iteration_y = gemm_info.broadcast_bias ? 1 : num_elems_processed_per_iteration_y;
-
- AccessWindowStatic input2_access(input2, 0, 0,
- ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),
- ceil_to_multiple(input2->dimension(1), bias_processed_per_iteration_y));
-
- window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
- }
- else
- {
- window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
- }
-
- output_access.set_valid_region(win_out, ValidRegion(Coordinates(), output->tensor_shape()));
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMMatrixMultiplyNativeKernel::CLGEMMMatrixMultiplyNativeKernel()
- : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false),
- _add_bias(false), _broadcast_bias(false)
-{
-}
-
-void CLGEMMMatrixMultiplyNativeKernel::configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input0, input1, input2, output, alpha, beta, lhs_info, rhs_info, gemm_info);
-}
-
-void CLGEMMMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha,
- float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), (input2 != nullptr ? input2->info() : nullptr), output->info(), alpha, beta, lhs_info, rhs_info, gemm_info));
-
- _input0 = input0;
- _input1 = input1;
- _input2 = helpers::float_ops::is_zero(beta) ? nullptr : input2;
- _output = output;
- _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
- _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
- _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
- _add_bias = _input2 != nullptr;
- _broadcast_bias = gemm_info.broadcast_bias;
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
- {
- _reinterpret_input_as_3d = false;
- _reinterpret_output_as_3d = false;
- }
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
- _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), input2 != nullptr ? input2->info() : nullptr, output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
- // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
- // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m
- const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : output->info()->dimension(1);
-
- const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(1) : input0->info()->dimension(1);
- const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(2) : input0->info()->dimension(2);
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
- build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
- build_opts.add_option_if(_input2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
- build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
- build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
- build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
- build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
- build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
- build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
- build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
-
- std::string kernel_name("gemm_mm_native");
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += (_add_bias ? "add_bias_" : "");
- _config_id += (_broadcast_bias ? "broadcast_bias_" : "");
- _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : "");
- _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(gemm_info.k);
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.m0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.n0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.k0);
-}
-
-Status CLGEMMMatrixMultiplyNativeKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, input2, output, alpha, beta, lhs_info, rhs_info, gemm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
- input1->clone().get(),
- input2 != nullptr ? input2->clone().get() : nullptr,
- output->clone().get(),
- lhs_info,
- rhs_info,
- gemm_info,
- num_elements_processed)
- .first);
-
- return Status{};
-}
-
-void CLGEMMMatrixMultiplyNativeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- if(_input1->info()->num_dimensions() < 3)
- {
- // The stride_z for matrix B must be zero if we do not slice
- ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
- }
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- if(_reinterpret_input_as_3d)
- {
- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
- unsigned int idx0;
- if(_add_bias)
- {
- idx0 = 4 * num_arguments_per_2D_tensor() + 4;
- }
- else
- {
- idx0 = 3 * num_arguments_per_2D_tensor() + 3;
- }
- const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- if(_reinterpret_output_as_3d)
- {
- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- unsigned int idx0;
- if(_add_bias)
- {
- idx0 = 4 * num_arguments_per_2D_tensor() + 4 + (_reinterpret_input_as_3d ? 1 : 0);
- }
- else
- {
- idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
- }
- const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- do
- {
- Window slice_b = slice;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input0, slice);
- add_2D_tensor_argument(idx, _input1, slice_b);
- if(_add_bias)
- {
- add_2D_tensor_argument(idx, _input2, slice);
- }
- add_2D_tensor_argument(idx, _output, slice);
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
- if(_add_bias)
- {
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input2->info()->strides_in_bytes()[2]));
- }
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
- enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
deleted file mode 100644
index 09e4e98a87..0000000000
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
+++ /dev/null
@@ -1,401 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/helpers/float_ops.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-class Coordinates;
-} // namespace arm_compute
-
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info)
-{
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose == rhs_info.transpose);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_info.transpose) && ((lhs_info.m0 & (lhs_info.m0 - 1)) && lhs_info.m0 != 3), "Only 2,3,4,8,16 are supported for m0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.transpose) && ((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (input2 != nullptr)
- && (!gemm_info.broadcast_bias),
- "Bias addition only supported with broadcast mode in case the input or output has to be reinterpreted as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (input0->data_type() == DataType::F32), "Mixed precision only supported for F16 data type");
-
- const unsigned int m = gemm_info.m;
- const unsigned int n = gemm_info.n;
- const unsigned int k = gemm_info.k;
-
- TensorShape tensor_shape0{ input0->tensor_shape() };
- tensor_shape0.set(0, k);
- tensor_shape0.set(1, m);
-
- TensorShape tensor_shape1{ input1->tensor_shape() };
- tensor_shape1.set(0, n);
- tensor_shape1.set(1, k);
-
- if(input2 != nullptr && !(helpers::float_ops::is_zero(beta)))
- {
- const unsigned int input2_dim0 = input2->dimension(0);
- const unsigned int input2_dim1 = input2->dimension(1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input2, input1);
- if(gemm_info.broadcast_bias)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim1 != 1 || input2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim0 != n || input2_dim1 != m), "Incorrect dimension of bias matrix");
- }
- }
-
- const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
- const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
-
- const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
- const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
-
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
-
- Window win{};
- Window win_out{};
- bool window_changed = false;
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)));
-
- TensorInfo tmp_info(*output);
-
- if(reinterpret_output_as_3d)
- {
- // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(output->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- // Configure kernel window
- num_elems_processed_per_iteration_x = rhs_info.n0;
- num_elems_processed_per_iteration_y = lhs_info.m0;
-
- // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor
- // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic
- const unsigned int m = gemm_info.m;
- const unsigned int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
-
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic input0_access(input0, 0, 0,
- ceil_to_multiple(input0->dimension(0), num_elems_processed_per_iteration_y),
- input0->dimension(1));
- AccessWindowStatic input1_access(input1, 0, 0,
- ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
- input1->dimension(1));
- AccessWindowStatic output_access(output, 0, 0,
- ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
- output->dimension(1) + bottom_pad);
-
- if(input2 != nullptr)
- {
- const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
-
- const int bias_processed_per_iteration_y = gemm_info.broadcast_bias ? 1 : num_elems_processed_per_iteration_y;
-
- AccessWindowStatic input2_access(input2, 0, 0,
- ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),
- ceil_to_multiple(input2->dimension(1), bias_processed_per_iteration_y));
-
- window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
- }
- else
- {
- window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
- }
-
- output_access.set_valid_region(win_out, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMMatrixMultiplyReshapedKernel::CLGEMMMatrixMultiplyReshapedKernel()
- : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1), _use_dummy_work_items(false), _add_bias(false),
- _broadcast_bias(false)
-{
-}
-
-void CLGEMMMatrixMultiplyReshapedKernel::configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input0, input1, input2, output, alpha, beta, lhs_info, rhs_info, gemm_info);
-}
-
-void CLGEMMMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha,
- float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), (input2 != nullptr ? input2->info() : nullptr), output->info(), alpha, beta, lhs_info, rhs_info, gemm_info));
-
- _input0 = input0;
- _input1 = input1;
- _input2 = helpers::float_ops::is_zero(beta) ? nullptr : input2;
- _output = output;
- _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
- _k = gemm_info.k;
- _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
- _add_bias = _input2 != nullptr;
- _broadcast_bias = gemm_info.broadcast_bias;
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
- _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), input2 != nullptr ? input2->info() : nullptr, output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- const bool enable_mixed_precision = gemm_info.fp_mixed_precision;
- const DataType data_type = input0->info()->data_type();
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
- build_opts.add_option_if(_input2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
- build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
- build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
- build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
- build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
- build_opts.add_option_if(lhs_info.transpose, "-DLHS_TRANSPOSE");
- build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
- build_opts.add_option_if(enable_mixed_precision, "-DMIXED_PRECISION");
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
- build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision ? get_cl_type_from_data_type(DataType::F32) : get_cl_type_from_data_type(data_type)));
- build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m));
- build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
- build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
- build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
-
- std::string kernel_name("gemm_mm_reshaped_");
- kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
- kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt";
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += (_add_bias ? "add_bias_" : "");
- _config_id += (_broadcast_bias ? "broadcast_bias_" : "");
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : "");
- _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
- _config_id += "_";
- _config_id += (enable_mixed_precision ? "mixed_precision_" : "");
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(gemm_info.k);
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.m0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.n0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.k0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.v0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.h0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.interleave);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.interleave);
-}
-
-Status CLGEMMMatrixMultiplyReshapedKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, input2, output, alpha, beta, lhs_info, rhs_info, gemm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
- input1->clone().get(),
- input2 != nullptr ? input2->clone().get() : nullptr,
- output->clone().get(),
- lhs_info,
- rhs_info,
- gemm_info,
- num_elements_processed)
- .first);
-
- return Status{};
-}
-
-void CLGEMMMatrixMultiplyReshapedKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- if(_input1->info()->num_dimensions() < 3)
- {
- // The stride_z for matrix B must be zero if we do not slice
- ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
- }
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- if(_reinterpret_output_as_3d)
- {
- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- unsigned int idx0;
- if(_add_bias)
- {
- idx0 = 4 * num_arguments_per_2D_tensor() + 5;
- }
- else
- {
- idx0 = 3 * num_arguments_per_2D_tensor() + 4;
- }
- const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- do
- {
- Window slice_b = slice;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input0, slice);
- add_2D_tensor_argument(idx, _input1, slice_b);
- add_2D_tensor_argument_if((_add_bias), idx, _input2, slice);
- add_2D_tensor_argument(idx, _output, slice);
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
- if(_add_bias)
- {
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input2->info()->strides_in_bytes()[2]));
- }
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
- enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
- }
- while(window.slide_window_slice_3D(slice));
-}
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
deleted file mode 100644
index 8e194d5139..0000000000
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
+++ /dev/null
@@ -1,419 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/helpers/float_ops.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/StringSupport.h"
-
-#include <tuple>
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info)
-{
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_info.m0 < 1 || lhs_info.m0 > 8, "Only 1,2,3,4,5,6,7,8 are supported for m0");
- ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16 || rhs_info.k0 < 2);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
- ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16 || rhs_info.n0 < 2);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (input2 != nullptr)
- && (!gemm_info.broadcast_bias),
- "Bias addition only supported with broadcast mode in case the input or output has to be reinterpreted as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
-
- const unsigned int m = gemm_info.m;
- const unsigned int n = gemm_info.n;
- const unsigned int k = gemm_info.k;
-
- TensorShape tensor_shape1{ input1->tensor_shape() };
- tensor_shape1.set(0, n);
- tensor_shape1.set(1, k);
-
- if(input2 != nullptr && !(helpers::float_ops::is_zero(beta)))
- {
- const unsigned int input2_dim0 = input2->dimension(0);
- const unsigned int input2_dim1 = input2->dimension(1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input2, input0);
- if(gemm_info.broadcast_bias)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim1 != 1 || input2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim0 != n || input2_dim1 != m), "Incorrect dimension of bias matrix");
- }
- }
-
- const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
-
- const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
-
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != k);
- if(gemm_info.reinterpret_input_as_3d)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != m);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != m);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
-
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
- bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
-
- Window win{};
- Window win_out{};
- bool window_changed = false;
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(reinterpret_input_as_3d == reinterpret_output_as_3d)
- {
- reinterpret_output_as_3d = false;
- }
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)));
-
- TensorInfo tmp_info(*output);
-
- if(reinterpret_output_as_3d)
- {
- // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(output->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- // Configure kernel window
- num_elems_processed_per_iteration_x = rhs_info.n0;
- num_elems_processed_per_iteration_y = lhs_info.m0;
-
- // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor
- // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic
- const unsigned int m = reinterpret_output_as_3d ? gemm_info.m : output->dimension(1);
- const unsigned int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
-
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic input0_access(input0, 0, 0,
- input0->dimension(0),
- input0->dimension(1) + bottom_pad);
- AccessWindowStatic input1_access(input1, 0, 0,
- input1->dimension(0),
- input1->dimension(1));
- AccessWindowStatic output_access(output, 0, 0,
- ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
- output->dimension(1) + bottom_pad);
-
- if(input2 != nullptr)
- {
- const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
-
- const int bias_processed_per_iteration_y = gemm_info.broadcast_bias ? 1 : num_elems_processed_per_iteration_y;
-
- AccessWindowStatic input2_access(input2, 0, 0,
- ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),
- ceil_to_multiple(input2->dimension(1), bias_processed_per_iteration_y));
-
- window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
- }
- else
- {
- window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
- }
-
- output_access.set_valid_region(win_out, ValidRegion(Coordinates(), output->tensor_shape()));
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::CLGEMMMatrixMultiplyReshapedOnlyRHSKernel()
- : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false),
- _add_bias(false), _broadcast_bias(false)
-{
-}
-
-void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input0, input1, input2, output, alpha, beta, lhs_info, rhs_info, gemm_info);
-}
-
-void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output,
- float alpha,
- float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), (input2 != nullptr ? input2->info() : nullptr), output->info(), alpha, beta, lhs_info, rhs_info, gemm_info));
-
- _input0 = input0;
- _input1 = input1;
- _input2 = helpers::float_ops::is_zero(beta) ? nullptr : input2;
- _output = output;
- _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
- _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
- _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
- _add_bias = _input2 != nullptr;
- _broadcast_bias = gemm_info.broadcast_bias;
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
- {
- _reinterpret_input_as_3d = false;
- _reinterpret_output_as_3d = false;
- }
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
- _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), input2 != nullptr ? input2->info() : nullptr, output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
- // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
- // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m
- const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : output->info()->dimension(1);
-
- const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(1) : input0->info()->dimension(1);
- const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(2) : input0->info()->dimension(2);
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
- build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
- build_opts.add_option_if(_input2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
- build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
- build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
- build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
- build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
- build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
- build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
- build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
- build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
-
- std::string kernel_name("gemm_mm_reshaped_only_rhs_");
- kernel_name += rhs_info.transpose ? "t" : "nt";
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += (_add_bias ? "add_bias_" : "");
- _config_id += (_broadcast_bias ? "broadcast_bias_" : "");
- _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : "");
- _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(gemm_info.k);
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.m0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.n0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.k0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.h0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.interleave);
-}
-
-Status CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, input2, output, alpha, beta, lhs_info, rhs_info, gemm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
- input1->clone().get(),
- input2 != nullptr ? input2->clone().get() : nullptr,
- output->clone().get(),
- lhs_info,
- rhs_info,
- gemm_info,
- num_elements_processed)
- .first);
-
- return Status{};
-}
-
-void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- if(_input1->info()->num_dimensions() < 3)
- {
- // The stride_z for matrix B must be zero if we do not slice
- ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
- }
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- if(_reinterpret_input_as_3d)
- {
- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
- unsigned int idx0;
- if(_add_bias)
- {
- idx0 = 4 * num_arguments_per_2D_tensor() + 4;
- }
- else
- {
- idx0 = 3 * num_arguments_per_2D_tensor() + 3;
- }
- const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- if(_reinterpret_output_as_3d)
- {
- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- unsigned int idx0;
- if(_add_bias)
- {
- idx0 = 4 * num_arguments_per_2D_tensor() + 4 + (_reinterpret_input_as_3d ? 1 : 0);
- }
- else
- {
- idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
- }
- const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- do
- {
- Window slice_b = slice;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input0, slice);
- add_2D_tensor_argument(idx, _input1, slice_b);
- add_2D_tensor_argument_if((_add_bias), idx, _input2, slice);
- add_2D_tensor_argument(idx, _output, slice);
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
- if(_add_bias)
- {
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input2->info()->strides_in_bytes()[2]));
- }
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
- enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
deleted file mode 100644
index 4e57259cd6..0000000000
--- a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-constexpr unsigned int num_elems_read_per_iteration = 4;
-constexpr unsigned int num_rows_read_per_iteration = 4;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input0->data_type()) && (output->data_type() != DataType::S32));
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(2) != input1->dimension(1));
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
-{
- const unsigned int border_x = ceil_to_multiple(input0->dimension(0), num_elems_read_per_iteration) - input0->dimension(0);
- const unsigned int border_y = ceil_to_multiple(input0->dimension(1), num_rows_read_per_iteration) - input0->dimension(1);
-
- Window win = calculate_max_window(*input0, Steps(num_elems_read_per_iteration));
-
- AccessWindowRectangle input0_access(input0, 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration);
- AccessWindowHorizontal input1_access(input1, 0, num_elems_read_per_iteration);
- AccessWindowStatic output_access(output, 0, 0, output->dimension(0) + border_x, output->dimension(1) + border_y);
-
- bool window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
-
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLGEMMMatrixVectorMultiplyKernel::CLGEMMMatrixVectorMultiplyKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _num_rows_read_per_iteration(0), _border_size(0)
-{
-}
-BorderSize CLGEMMMatrixVectorMultiplyKernel::border_size() const
-{
- return _border_size;
-}
-
-void CLGEMMMatrixVectorMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output);
-}
-
-void CLGEMMMatrixVectorMultiplyKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
-
- // Check if is a quantized operation
- const bool is_quantized = is_data_type_quantized_asymmetric(_input0->info()->data_type());
-
- // Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
- build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input0->info()->dimension(0)));
- build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input0->info()->dimension(1)));
-
- std::string kernel_name = is_quantized ? std::string("gemm_mv_quantized") : std::string("gemm_mv");
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Add static arguments
- if(is_quantized)
- {
- const UniformQuantizationInfo iq0_info = _input0->info()->quantization_info().uniform();
- const UniformQuantizationInfo iq1_info = _input1->info()->quantization_info().uniform();
-
- unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor() + num_arguments_per_1D_tensor();
- _kernel.setArg<int>(idx++, -iq0_info.offset);
- _kernel.setArg<int>(idx++, -iq1_info.offset);
- }
-
- // Configure kernel window
- _num_rows_read_per_iteration = num_rows_read_per_iteration;
-
- const unsigned int border_x = ceil_to_multiple(input0->info()->dimension(0), num_elems_read_per_iteration) - input0->info()->dimension(0);
- const unsigned int border_y = ceil_to_multiple(input0->info()->dimension(1), _num_rows_read_per_iteration) - input0->info()->dimension(1);
-
- _border_size = BorderSize(border_y, border_x);
-
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-}
-
-Status CLGEMMMatrixVectorMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first);
-
- return Status{};
-}
-
-void CLGEMMMatrixVectorMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- Window slice_in = window.first_slice_window_3D();
- Window slice_in2 = window.first_slice_window_3D();
- Window slice_out = window.first_slice_window_3D();
-
- // Setup input0 slice
- slice_in.set(Window::DimX, Window::Dimension(0, _input0->info()->dimension(0), _input0->info()->dimension(0)));
- slice_in.set(Window::DimY, Window::Dimension(0, _input0->info()->dimension(1) + border_size().bottom, _num_rows_read_per_iteration));
- slice_in.set(Window::DimZ, Window::Dimension(0, _input0->info()->dimension(2), 1));
-
- // Setup input1 and output slice. Their dimensions are increased in the cl kernel.
- slice_in2.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_in2.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_in2.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- unsigned int idx_1 = num_arguments_per_3D_tensor();
-
- add_2D_tensor_argument(idx_1, _input1, slice_in2);
-
- do
- {
- unsigned int idx_0 = 0;
- unsigned int idx_2 = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
- add_3D_tensor_argument(idx_0, _input0, slice_in);
- add_1D_tensor_argument(idx_2, _output, slice_out);
- enqueue(queue, *this, slice_in, lws_hint());
- }
- while(window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
deleted file mode 100644
index 3267a0e39e..0000000000
--- a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.v0 == 0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
-
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_lhs_reshaped_shape(*input, lhs_info, reinterpret_input_as_3d));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
-{
- const unsigned int num_elems_processed_per_iteration_x = lhs_info.k0;
- const unsigned int num_elems_processed_per_iteration_y = lhs_info.m0;
- bool window_changed = false;
-
- TensorInfo tmp_info(*input);
-
- if(reinterpret_input_as_3d)
- {
- // Since the input tensor has to be reinterpreted as 3D and the execute window is based on a 2D interleave,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(input->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*input, lhs_info, reinterpret_input_as_3d)));
-
- // Configure window
- // Note: bottom paddings are calculated manually as the input can be reinterpreted as 3D tensor
- // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic
- const int m = reinterpret_input_as_3d ? input->tensor_shape()[1] * input->tensor_shape()[2] : input->tensor_shape()[1];
- const int bottom_pad = ceil_to_multiple(m, num_elems_processed_per_iteration_y) - m;
-
- Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- Window win_in = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic input_access(input, 0, 0,
- ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration_x),
- input->dimension(1) + bottom_pad);
- AccessWindowStatic output_access(output, 0, 0, output->dimension(0), output->dimension(1));
-
- window_changed = update_window_and_padding(win_in, input_access) || // window used by the execute_window_loop
- update_window_and_padding(win, output_access); // window used to update the padding requirements of output tensor
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win.collapse(win, Window::DimZ);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMReshapeLHSMatrixKernel::CLGEMMReshapeLHSMatrixKernel()
- : _input(nullptr), _output(nullptr), _reinterpret_input_as_3d(false)
-{
-}
-
-void CLGEMMReshapeLHSMatrixKernel::configure(const ICLTensor *input, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, lhs_info, reinterpret_input_as_3d);
-}
-
-void CLGEMMReshapeLHSMatrixKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), lhs_info, reinterpret_input_as_3d));
-
- _input = input;
- _output = output;
- _reinterpret_input_as_3d = reinterpret_input_as_3d;
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
- build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
- build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
- build_opts.add_option_if(lhs_info.interleave, "-DINTERLEAVE");
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(1)));
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(2)));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
-
- std::string kernel_name("gemm_reshape_lhs_matrix_");
- kernel_name += lhs_info.transpose ? "t" : "nt";
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), lhs_info, reinterpret_input_as_3d);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Set config_id for enabling LWS tuning
- _config_id = "gemm_reshape_lhs_matrix_";
- _config_id += (_reinterpret_input_as_3d ? "3d_" : "");
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.m0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.k0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.v0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.interleave);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.transpose);
-}
-
-Status CLGEMMReshapeLHSMatrixKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, lhs_info, reinterpret_input_as_3d));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), lhs_info, reinterpret_input_as_3d).first);
-
- return Status{};
-}
-
-void CLGEMMReshapeLHSMatrixKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_3D();
-
- if(_reinterpret_input_as_3d)
- {
- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 2 * num_arguments_per_3D_tensor();
- const unsigned int total_cross_plane_pad = _input->info()->padding().top + _input->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp
deleted file mode 100644
index 43e7b92c6a..0000000000
--- a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp
+++ /dev/null
@@ -1,194 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const GEMMRHSMatrixInfo &rhs_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.h0 == 0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && (rhs_info.k0 != 1) && (rhs_info.k0 != 3)), "Only 1,2,3,4,8,16 are supported for k0");
- ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16);
- ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16);
- ARM_COMPUTE_RETURN_ERROR_ON((rhs_info.k0 == 1) && (rhs_info.transpose));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image && ((rhs_info.n0 != 4) || input->data_type() != DataType::F32), "Export to cl_image only supported with n0 = 4 and F32 data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image
- && !image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported on the target platform");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image && (get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0), "Impossible to retrieve the cl_image pitch alignment");
-
- if(rhs_info.export_to_cl_image)
- {
- TensorShape output_shape = compute_rhs_reshaped_shape(*input, rhs_info);
-
- // Check the width and height of the output tensor.
- // Since we cannot create a 3d image from a buffer, the third dimension is collapsed with the second dimension
- size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
- size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape[0] > max_image_w * 4, "Not supported width for cl_image");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape[1] * output_shape[2] > max_image_h, "Not supported height for cl_image");
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_rhs_reshaped_shape(*input, rhs_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const GEMMRHSMatrixInfo &rhs_info)
-{
- const unsigned int num_elems_processed_per_iteration_x = rhs_info.n0;
- const unsigned int num_elems_processed_per_iteration_y = rhs_info.k0;
- bool window_changed = false;
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*input, rhs_info)));
-
- // Configure window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
- AccessWindowStatic output_access(output, 0, 0, output->dimension(0), output->dimension(1));
-
- window_changed = update_window_and_padding(win, input_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
-
- if(rhs_info.export_to_cl_image)
- {
- constexpr unsigned int num_floats_per_pixel = 4;
-
- const unsigned int stride_y_in_elements = output->strides_in_bytes()[1] / output->element_size();
- const unsigned int pixel_aligment = get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device());
- const unsigned int row_pitch_alignment = pixel_aligment * num_floats_per_pixel;
- const unsigned int round_up_width = ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment;
- const unsigned int padding = round_up_width - stride_y_in_elements;
-
- output->extend_padding(PaddingSize(0, padding, 0, 0));
- }
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win.collapse(win, Window::DimZ);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMReshapeRHSMatrixKernel::CLGEMMReshapeRHSMatrixKernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLGEMMReshapeRHSMatrixKernel::configure(const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, rhs_info);
-}
-
-void CLGEMMReshapeRHSMatrixKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), rhs_info));
-
- _input = input;
- _output = output;
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
- build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
- build_opts.add_option_if(rhs_info.transpose, "-DTRANSPOSE");
- build_opts.add_option_if(rhs_info.interleave, "-DINTERLEAVE");
- build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
-
- std::string kernel_name("gemm_reshape_rhs_matrix_");
- kernel_name += rhs_info.transpose ? "t" : "nt";
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), rhs_info);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-}
-
-Status CLGEMMReshapeRHSMatrixKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const GEMMRHSMatrixInfo &rhs_info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, rhs_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), rhs_info).first);
-
- return Status{};
-}
-
-void CLGEMMReshapeRHSMatrixKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGatherKernel.cpp b/src/core/CL/kernels/CLGatherKernel.cpp
index 2cb8f2380a..904bb07282 100644
--- a/src/core/CL/kernels/CLGatherKernel.cpp
+++ b/src/core/CL/kernels/CLGatherKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,10 +21,13 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
+#include "src/core/CL/kernels/CLGatherKernel.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
#include <string>
@@ -33,20 +36,22 @@ namespace arm_compute
{
namespace
{
-inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+inline Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
- ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() + indices->num_dimensions() - 1) > 4);
+
ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(
+ input->tensor_shape(), indices->tensor_shape(), actual_axis);
ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
}
@@ -55,26 +60,27 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, ITensorInfo *output, int axis)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, ITensorInfo *output, int axis)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
// Output auto initialization if not yet initialized
- TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(
+ input->tensor_shape(), indices->tensor_shape(), actual_axis);
auto_init_if_empty((*output), output_shape, 1, input->data_type());
// Create window
Window win = calculate_max_window(*output, Steps());
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
return std::make_pair(Status{}, win);
}
} // namespace
-CLGatherKernel::CLGatherKernel()
- : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
+CLGatherKernel::CLGatherKernel() : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLGatherKernel::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
@@ -82,9 +88,14 @@ void CLGatherKernel::configure(const ICLTensor *input, const ICLTensor *indices,
configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis);
}
-void CLGatherKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
+void CLGatherKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ int axis)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
+ auto padding_info = get_padding_info({input, output, indices});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), indices->info(), output->info(), axis));
// Configure kernel window
@@ -98,20 +109,25 @@ void CLGatherKernel::configure(const CLCompileContext &compile_context, const IC
// Set build options
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
build_opts.add_option("-DOUTPUT_DIM_Z=" + support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.add_option("-DINDICES_DIMS=" + support::cpp11::to_string(indices->info()->num_dimensions()));
build_opts.add_option("-DAXIS=" + support::cpp11::to_string(_axis));
+ build_opts.add_option("-DINDEX_LIMIT=" + support::cpp11::to_string(input->info()->tensor_shape()[_axis]));
// Create kernel
_kernel = create_kernel(compile_context, "gather", build_opts.options());
ICLKernel::configure_internal(win_config.second);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+Status
+CLGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis).first);
return Status{};
}
@@ -123,7 +139,7 @@ void CLGatherKernel::run(const Window &window, cl::CommandQueue &queue)
Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
unsigned int idx = 0;
add_4D_tensor_argument(idx, _input, window_collapsed);
- add_1D_tensor_argument(idx, _indices, window_collapsed);
+ add_4D_tensor_argument(idx, _indices, window_collapsed);
add_4D_tensor_argument(idx, _output, window_collapsed);
enqueue(queue, *this, window_collapsed, lws_hint());
}
diff --git a/src/core/CL/kernels/CLGatherKernel.h b/src/core/CL/kernels/CLGatherKernel.h
new file mode 100644
index 0000000000..db4b49d2f5
--- /dev/null
+++ b/src/core/CL/kernels/CLGatherKernel.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGATHERKERNEL_H
+#define ARM_COMPUTE_CLGATHERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to gather slices of a tensor along a given axis according to an indices tensor */
+class CLGatherKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLGatherKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGatherKernel(const CLGatherKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGatherKernel &operator=(const CLGatherKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLGatherKernel(CLGatherKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLGatherKernel &operator=(CLGatherKernel &&) = default;
+ /** Default destructor */
+ ~CLGatherKernel() = default;
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported: All.
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 4 (rank of @p input + rank of @p indices - 1 must not exceed 4). Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+ * @param[out] output Destination tensor. Data type supported: Same as @p input
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
+ */
+ void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0);
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported: All.
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 4 (rank of @p input + rank of @p indices - 1 must not exceed 4). Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+ * @param[out] output Destination tensor. Data type supported: Same as @p input
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ int axis = 0);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref CLGatherKernel
+ *
+ * @param[in] input Source tensor info. Supported tensor rank: up to 4. Data type supported: All.
+ * @param[in] indices Indices tensor info. Supported tensor rank: up to 4. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+ * @param[in] output Destination tensor info. Data type supported: Same as @p input
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
+ *
+ * @return a status
+ */
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ const ICLTensor *_indices; /**< Indices tensor */
+ ICLTensor *_output; /**< Destination tensor */
+ int _axis; /**< Axis index */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGATHERKERNEL_H */
diff --git a/src/core/CL/kernels/CLGaussian3x3Kernel.cpp b/src/core/CL/kernels/CLGaussian3x3Kernel.cpp
deleted file mode 100644
index 210ffb9123..0000000000
--- a/src/core/CL/kernels/CLGaussian3x3Kernel.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-BorderSize CLGaussian3x3Kernel::border_size() const
-{
- return BorderSize(1);
-}
-
-void CLGaussian3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined);
-}
-
-void CLGaussian3x3Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
- _input = input;
- _output = output;
-
- // Set build options
- std::set<std::string> build_opts = { "-DMAT0=1", "-DMAT1=2", "-DMAT2=1",
- "-DMAT3=2", "-DMAT4=4", "-DMAT5=2",
- "-DMAT6=1", "-DMAT7=2", "-DMAT8=1",
- "-DSCALE=16", "-DDATA_TYPE_OUT=uchar"
- };
-
- // Create kernel
- _kernel = create_kernel(compile_context, "convolution3x3_static", build_opts);
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 8;
- constexpr unsigned int num_elems_written_per_iteration = 8;
- constexpr unsigned int num_elems_read_per_iteration = 16;
- constexpr unsigned int num_rows_read_per_iteration = 3;
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
- AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
- ICLKernel::configure_internal(win);
-}
diff --git a/src/core/CL/kernels/CLGaussian5x5Kernel.cpp b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
deleted file mode 100644
index cb864671db..0000000000
--- a/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h"
-
-#include <cstdint>
-
-using namespace arm_compute;
-
-void CLGaussian5x5HorKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined);
-}
-
-void CLGaussian5x5HorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
- const std::array<int16_t, 5> matrix = { 1, 4, 6, 4, 1 };
-
- // Set arguments
- CLSeparableConvolution5x5HorKernel::configure(compile_context, input, output, matrix.data(), border_undefined);
-}
-
-void CLGaussian5x5VertKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined);
-}
-
-void CLGaussian5x5VertKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
- const uint32_t scale = 256;
- const std::array<int16_t, 5> matrix = { 1, 4, 6, 4, 1 };
-
- // Set arguments
- CLSeparableConvolution5x5VertKernel::configure(compile_context, input, output, matrix.data(), scale, border_undefined);
-}
diff --git a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
deleted file mode 100644
index 73dbda22f3..0000000000
--- a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-CLGaussianPyramidHorKernel::CLGaussianPyramidHorKernel()
- : _l2_load_offset(0)
-{
-}
-
-BorderSize CLGaussianPyramidHorKernel::border_size() const
-{
- return BorderSize{ 0, 2 };
-}
-
-void CLGaussianPyramidHorKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLGaussianPyramidHorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16);
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
-
- for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
- {
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
- }
-
- _input = input;
- _output = output;
-
- // Create kernel
- const std::string kernel_name = std::string("gaussian1x5_sub_x");
- _kernel = create_kernel(compile_context, kernel_name);
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 16;
- constexpr unsigned int num_elems_read_per_iteration = 20;
- constexpr unsigned int num_elems_written_per_iteration = 8;
- const float scale_x = static_cast<float>(output->info()->dimension(0)) / input->info()->dimension(0);
-
- Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration, scale_x);
-
- // Sub sampling selects odd pixels (1, 3, 5, ...) for images with even
- // width and even pixels (0, 2, 4, ...) for images with odd width. (Whether
- // a pixel is even or odd is determined based on the tensor shape not the
- // valid region!)
- // Thus the offset from which the first pixel (L2) for the convolution is
- // loaded depends on the anchor and shape of the valid region.
- // In the case of an even shape (= even image width) we need to load L2
- // from -2 if the anchor is odd and from -1 if the anchor is even. That
- // makes sure that L2 is always loaded from an odd pixel.
- // On the other hand, for an odd shape (= odd image width) we need to load
- // L2 from -1 if the anchor is odd and from -2 if the anchor is even to
- // achieve the opposite effect.
- // The condition can be simplified to checking whether anchor + shape is
- // odd (-2) or even (-1) as only adding an odd and an even number will have
- // an odd result.
- _l2_load_offset = -border_size().left;
-
- if((_input->info()->valid_region().anchor[0] + _input->info()->valid_region().shape[0]) % 2 == 0)
- {
- _l2_load_offset += 1;
- }
-
- update_window_and_padding(win,
- AccessWindowHorizontal(input->info(), _l2_load_offset, num_elems_read_per_iteration),
- output_access);
-
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-void CLGaussianPyramidHorKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window win_in(window);
- win_in.shift(Window::DimX, _l2_load_offset);
-
- //The output is half the width of the input:
- Window win_out(window);
- win_out.scale(Window::DimX, 0.5f);
-
- Window slice_in = win_in.first_slice_window_2D();
- Window slice_out = win_out.first_slice_window_2D();
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice_in);
- add_2D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out, lws_hint());
- }
- while(win_in.slide_window_slice_2D(slice_in) && win_out.slide_window_slice_2D(slice_out));
-}
-
-CLGaussianPyramidVertKernel::CLGaussianPyramidVertKernel()
- : _t2_load_offset(0)
-{
-}
-
-BorderSize CLGaussianPyramidVertKernel::border_size() const
-{
- return BorderSize{ 2, 0 };
-}
-
-void CLGaussianPyramidVertKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLGaussianPyramidVertKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
-
- for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
- {
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
- }
-
- _input = input;
- _output = output;
-
- // Create kernel
- const std::string kernel_name = std::string("gaussian5x1_sub_y");
- _kernel = create_kernel(compile_context, "gaussian5x1_sub_y");
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 8;
- constexpr unsigned int num_rows_processed_per_iteration = 2;
- constexpr unsigned int num_elems_written_per_iteration = 8;
- constexpr unsigned int num_elems_read_per_iteration = 8;
- constexpr unsigned int num_rows_per_iteration = 5;
-
- const float scale_y = static_cast<float>(output->info()->dimension(1)) / input->info()->dimension(1);
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_rows_processed_per_iteration));
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_per_iteration, 1.f, scale_y);
-
- // Determine whether we need to load even or odd rows. See above for a
- // detailed explanation.
- _t2_load_offset = -border_size().top;
-
- if((_input->info()->valid_region().anchor[1] + _input->info()->valid_region().shape[1]) % 2 == 0)
- {
- _t2_load_offset += 1;
- }
-
- update_window_and_padding(win,
- AccessWindowRectangle(input->info(), 0, _t2_load_offset, num_elems_read_per_iteration, num_rows_per_iteration),
- output_access);
-
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-void CLGaussianPyramidVertKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(window.x().step() != 8);
- ARM_COMPUTE_ERROR_ON(window.y().step() % 2);
-
- Window win_in(window);
- win_in.shift(Window::DimY, _t2_load_offset);
-
- Window win_out(window);
- win_out.scale(Window::DimY, 0.5f);
-
- Window slice_in = win_in.first_slice_window_2D();
- Window slice_out = win_out.first_slice_window_2D();
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice_in);
- add_2D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out, lws_hint());
- }
- while(win_in.slide_window_slice_2D(slice_in) && win_out.slide_window_slice_2D(slice_out));
-}
diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
index 8baac18bf6..b9ff72b928 100644
--- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
+++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,19 +21,20 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h"
+#include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h"
-#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLArray.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
@@ -47,7 +48,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
ARM_COMPUTE_RETURN_ERROR_ON(anchors->dimension(0) != info.values_per_roi());
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(anchors, DataType::QSYMM16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(anchors->num_dimensions() > 2);
- if(all_anchors->total_size() > 0)
+ if (all_anchors->total_size() > 0)
{
size_t feature_height = info.feat_height();
size_t feature_width = info.feat_width();
@@ -57,7 +58,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(0) != info.values_per_roi());
ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(1) != feature_height * feature_width * num_anchors);
- if(is_data_type_quantized(anchors->data_type()))
+ if (is_data_type_quantized(anchors->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(anchors, all_anchors);
}
@@ -66,19 +67,25 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
}
} // namespace
-CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel()
- : _anchors(nullptr), _all_anchors(nullptr)
+CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel() : _anchors(nullptr), _all_anchors(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLComputeAllAnchorsKernel::configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info)
+void CLComputeAllAnchorsKernel::configure(const ICLTensor *anchors,
+ ICLTensor *all_anchors,
+ const ComputeAnchorsInfo &info)
{
configure(CLKernelLibrary::get().get_compile_context(), anchors, all_anchors, info);
}
-void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info)
+void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *anchors,
+ ICLTensor *all_anchors,
+ const ComputeAnchorsInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(anchors, all_anchors);
+ auto padding_info = get_padding_info({anchors, all_anchors});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(anchors->info(), all_anchors->info(), info));
// Metadata
@@ -89,7 +96,8 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex
// Initialize the output if empty
const TensorShape output_shape(info.values_per_roi(), width * height * num_anchors);
- auto_init_if_empty(*all_anchors->info(), TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info()));
+ auto_init_if_empty(*all_anchors->info(),
+ TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info()));
// Set instance variables
_anchors = anchors;
@@ -106,7 +114,7 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex
build_opts.add_option("-DNUM_ANCHORS=" + support::cpp11::to_string(num_anchors));
build_opts.add_option("-DNUM_ROI_FIELDS=" + support::cpp11::to_string(info.values_per_roi()));
- if(is_quantized)
+ if (is_quantized)
{
const UniformQuantizationInfo qinfo = anchors->info()->quantization_info().uniform();
build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
@@ -114,17 +122,21 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex
}
// Create kernel
- const std::string kernel_name = (is_quantized) ? "generate_proposals_compute_all_anchors_quantized" : "generate_proposals_compute_all_anchors";
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+ const std::string kernel_name =
+ (is_quantized) ? "generate_proposals_compute_all_anchors_quantized" : "generate_proposals_compute_all_anchors";
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// The tensor all_anchors can be interpreted as an array of structs (each structs has values_per_roi fields).
// This means we don't need to pad on the X dimension, as we know in advance how many fields
// compose the struct.
Window win = calculate_max_window(*all_anchors->info(), Steps(info.values_per_roi()));
ICLKernel::configure_internal(win);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
+Status CLComputeAllAnchorsKernel::validate(const ITensorInfo *anchors,
+ const ITensorInfo *all_anchors,
+ const ComputeAnchorsInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(anchors, all_anchors, info));
return Status{};
diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h
new file mode 100644
index 0000000000..e08f281d6c
--- /dev/null
+++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGENERATEPROPOSALSLAYERKERNEL_H
+#define ARM_COMPUTE_CLGENERATEPROPOSALSLAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for Compute All Anchors kernel */
+class CLComputeAllAnchorsKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLComputeAllAnchorsKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLComputeAllAnchorsKernel(const CLComputeAllAnchorsKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLComputeAllAnchorsKernel &operator=(const CLComputeAllAnchorsKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLComputeAllAnchorsKernel(CLComputeAllAnchorsKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLComputeAllAnchorsKernel &operator=(CLComputeAllAnchorsKernel &&) = default;
+ /** Default destructor */
+ ~CLComputeAllAnchorsKernel() = default;
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] anchors Source tensor. Original set of anchors of size (4, A), where A is the number of anchors. Data types supported: QSYMM16/F16/F32
+ * @param[out] all_anchors Destination tensor. Destination anchors of size (4, H*W*A) where H and W are the height and width of the feature map and A is the number of anchors. Data types supported: Same as @p anchors
+ * @param[in] info Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
+ *
+ */
+ void configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info);
+ /** Set the input and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] anchors Source tensor. Original set of anchors of size (4, A), where A is the number of anchors. Data types supported: QSYMM16/F16/F32
+ * @param[out] all_anchors Destination tensor. Destination anchors of size (4, H*W*A) where H and W are the height and width of the feature map and A is the number of anchors. Data types supported: Same as @p anchors
+ * @param[in] info Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
+ *
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *anchors,
+ ICLTensor *all_anchors,
+ const ComputeAnchorsInfo &info);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref CLComputeAllAnchorsKernel
+ *
+ * @param[in] anchors Source tensor info. Original set of anchors of size (4, A), where A is the number of anchors. Data types supported: QSYMM16/F16/F32
+ * @param[in] all_anchors Destination tensor info. Destination anchors of size (4, H*W*A) where H and W are the height and width of the feature map and A is the number of anchors. Data types supported: Same as @p anchors
+ * @param[in] info Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
+ *
+ * @return a Status
+ */
+ static Status validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_anchors;
+ ICLTensor *_all_anchors;
+};
+} // namespace arm_compute
+#endif // ARM_COMPUTE_CLGENERATEPROPOSALSLAYERKERNEL_H
diff --git a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
deleted file mode 100644
index e58b62e9de..0000000000
--- a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
+++ /dev/null
@@ -1,239 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <sstream>
-#include <string>
-
-using namespace arm_compute;
-
-CLHOGOrientationBinningKernel::CLHOGOrientationBinningKernel()
- : _input_magnitude(nullptr), _input_phase(nullptr), _output(nullptr), _cell_size()
-{
-}
-
-void CLHOGOrientationBinningKernel::configure(const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input_magnitude, input_phase, output, hog_info);
-}
-
-void CLHOGOrientationBinningKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_magnitude, 1, DataType::S16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_phase, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins(), DataType::F32);
- ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimX) != input_phase->info()->dimension(Window::DimX));
- ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimY) != input_phase->info()->dimension(Window::DimY));
-
- _input_magnitude = input_magnitude;
- _input_phase = input_phase;
- _output = output;
- _cell_size = hog_info->cell_size();
-
- float phase_scale = (PhaseType::SIGNED == hog_info->phase_type() ? hog_info->num_bins() / 360.0f : hog_info->num_bins() / 180.0f);
- phase_scale *= (PhaseType::SIGNED == hog_info->phase_type() ? 360.0f / 255.0f : 1.0f);
-
- std::stringstream args_str;
- args_str << "-DCELL_WIDTH=" << hog_info->cell_size().width << " ";
- args_str << "-DCELL_HEIGHT=" << hog_info->cell_size().height << " ";
- args_str << "-DNUM_BINS=" << hog_info->num_bins() << " ";
- args_str << "-DPHASE_SCALE=" << phase_scale << " ";
-
- // Construct kernel name
- std::set<std::string> build_opts = {};
- build_opts.insert(args_str.str());
-
- // Create kernel
- const std::string kernel_name = std::string("hog_orientation_binning");
- _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
- constexpr unsigned int num_elems_processed_per_iteration = 1;
- constexpr unsigned int num_elems_read_per_iteration = 1;
- const unsigned int num_rows_read_per_iteration = hog_info->cell_size().height;
- constexpr unsigned int num_elems_written_per_iteration = 1;
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
- update_window_and_padding(win,
- AccessWindowRectangle(input_magnitude->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
- AccessWindowRectangle(input_phase->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
- output_access);
-
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input_magnitude->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input_magnitude->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input_magnitude->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-void CLHOGOrientationBinningKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
- do
- {
- // Compute slice for the magnitude and phase tensors
- Window slice_mag_phase = window.first_slice_window_2D();
- slice_mag_phase.set(Window::DimX, Window::Dimension(window.x().start() * _cell_size.width, window.x().start() * _cell_size.width, _cell_size.width));
- slice_mag_phase.set(Window::DimY, Window::Dimension(window.y().start() * _cell_size.height, window.y().start() * _cell_size.height, _cell_size.height));
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input_magnitude, slice_mag_phase);
- add_2D_tensor_argument(idx, _input_phase, slice_mag_phase);
- add_2D_tensor_argument(idx, _output, slice);
-
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
-
-CLHOGBlockNormalizationKernel::CLHOGBlockNormalizationKernel()
- : _input(nullptr), _output(nullptr), _num_cells_per_block_stride()
-{
-}
-
-void CLHOGBlockNormalizationKernel::configure(const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, hog_info);
-}
-
-void CLHOGBlockNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info)
-{
- ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, hog_info->num_bins(), DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
-
- // Number of cells per block
- const Size2D num_cells_per_block(hog_info->block_size().width / hog_info->cell_size().width,
- hog_info->block_size().height / hog_info->cell_size().height);
-
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins() * num_cells_per_block.area(), DataType::F32);
-
- // Number of cells per block stride
- const Size2D num_cells_per_block_stride(hog_info->block_stride().width / hog_info->cell_size().width,
- hog_info->block_stride().height / hog_info->cell_size().height);
-
- _input = input;
- _output = output;
- _num_cells_per_block_stride = num_cells_per_block_stride;
-
- std::stringstream args_str;
- args_str << "-DL2_HYST_THRESHOLD=" << hog_info->l2_hyst_threshold() << " ";
- args_str << "-DNUM_CELLS_PER_BLOCK_HEIGHT=" << num_cells_per_block.height << " ";
- args_str << "-DNUM_BINS_PER_BLOCK_X=" << num_cells_per_block.width *hog_info->num_bins() << " ";
- args_str << "-DNUM_BINS_PER_BLOCK=" << _output->info()->num_channels() << " ";
- args_str << "-DL2_NORM=" << static_cast<int>(HOGNormType::L2_NORM) << " ";
- args_str << "-DL1_NORM=" << static_cast<int>(HOGNormType::L1_NORM) << " ";
- args_str << "-DL2HYS_NORM=" << static_cast<int>(HOGNormType::L2HYS_NORM) << " ";
- args_str << "-DHOG_NORM_TYPE=" << static_cast<int>(hog_info->normalization_type()) << " ";
-
- // Construct kernel name
- std::set<std::string> build_opts = {};
- build_opts.insert(args_str.str());
-
- const std::string kernel_name = std::string("hog_block_normalization");
- _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
- constexpr unsigned int num_elems_processed_per_iteration = 1;
- constexpr unsigned int num_elems_read_per_iteration = 1;
- const unsigned int num_rows_read_per_iteration = num_cells_per_block.height;
- constexpr unsigned int num_elems_written_per_iteration = 1;
- const unsigned int num_rows_written_per_iteration = num_cells_per_block.height;
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_written_per_iteration);
-
- update_window_and_padding(win,
- AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
- output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-void CLHOGBlockNormalizationKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
- do
- {
- // Compute slice for the magnitude and phase tensors
- Window slice_in = window.first_slice_window_2D();
- slice_in.set_dimension_step(Window::DimX, _num_cells_per_block_stride.width);
- slice_in.set_dimension_step(Window::DimY, _num_cells_per_block_stride.height);
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice_in);
- add_2D_tensor_argument(idx, _output, slice);
-
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLHOGDetectorKernel.cpp b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
deleted file mode 100644
index bee9744aad..0000000000
--- a/src/core/CL/kernels/CLHOGDetectorKernel.cpp
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLHOG.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-CLHOGDetectorKernel::CLHOGDetectorKernel()
- : _input(nullptr), _detection_windows(), _num_detection_windows(nullptr)
-{
-}
-
-void CLHOGDetectorKernel::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows, const Size2D &detection_window_stride,
- float threshold, uint16_t idx_class)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, hog, detection_windows, num_detection_windows, detection_window_stride, threshold, idx_class);
-}
-
-void CLHOGDetectorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows,
- const Size2D &detection_window_stride,
- float threshold, uint16_t idx_class)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32);
- ARM_COMPUTE_ERROR_ON(hog == nullptr);
- ARM_COMPUTE_ERROR_ON(detection_windows == nullptr);
- ARM_COMPUTE_ERROR_ON(num_detection_windows == nullptr);
- ARM_COMPUTE_ERROR_ON((detection_window_stride.width % hog->info()->block_stride().width) != 0);
- ARM_COMPUTE_ERROR_ON((detection_window_stride.height % hog->info()->block_stride().height) != 0);
-
- const Size2D &detection_window_size = hog->info()->detection_window_size();
- const Size2D &block_size = hog->info()->block_size();
- const Size2D &block_stride = hog->info()->block_stride();
-
- _input = input;
- _detection_windows = detection_windows;
- _num_detection_windows = num_detection_windows;
-
- const unsigned int num_bins_per_descriptor_x = ((detection_window_size.width - block_size.width) / block_stride.width + 1) * input->info()->num_channels();
- const unsigned int num_blocks_per_descriptor_y = (detection_window_size.height - block_size.height) / block_stride.height + 1;
-
- ARM_COMPUTE_ERROR_ON((num_bins_per_descriptor_x * num_blocks_per_descriptor_y + 1) != hog->info()->descriptor_size());
-
- std::stringstream args_str;
- args_str << "-DNUM_BLOCKS_PER_DESCRIPTOR_Y=" << num_blocks_per_descriptor_y << " ";
- args_str << "-DNUM_BINS_PER_DESCRIPTOR_X=" << num_bins_per_descriptor_x << " ";
- args_str << "-DTHRESHOLD=" << threshold << " ";
- args_str << "-DMAX_NUM_DETECTION_WINDOWS=" << detection_windows->max_num_values() << " ";
- args_str << "-DIDX_CLASS=" << idx_class << " ";
- args_str << "-DDETECTION_WINDOW_WIDTH=" << detection_window_size.width << " ";
- args_str << "-DDETECTION_WINDOW_HEIGHT=" << detection_window_size.height << " ";
- args_str << "-DDETECTION_WINDOW_STRIDE_WIDTH=" << detection_window_stride.width << " ";
- args_str << "-DDETECTION_WINDOW_STRIDE_HEIGHT=" << detection_window_stride.height << " ";
-
- // Construct kernel name
- std::set<std::string> build_opts = {};
- build_opts.insert(args_str.str());
-
- // Create kernel
- const std::string kernel_name = std::string("hog_detector");
- _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
- // Set static kernel arguments
- unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input parameters
- _kernel.setArg(idx++, hog->cl_buffer());
- _kernel.setArg(idx++, detection_windows->cl_buffer());
- _kernel.setArg(idx++, *_num_detection_windows);
-
- // Get the number of blocks along the x and y directions of the input tensor
- const ValidRegion &valid_region = input->info()->valid_region();
- const size_t num_blocks_x = valid_region.shape[0];
- const size_t num_blocks_y = valid_region.shape[1];
-
- // Get the number of blocks along the x and y directions of the detection window
- const size_t num_blocks_per_detection_window_x = detection_window_size.width / block_stride.width;
- const size_t num_blocks_per_detection_window_y = detection_window_size.height / block_stride.height;
-
- const size_t window_step_x = detection_window_stride.width / block_stride.width;
- const size_t window_step_y = detection_window_stride.height / block_stride.height;
-
- // Configure kernel window
- Window win;
- win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x) + window_step_x, window_step_x));
- win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y) + window_step_y, window_step_y));
-
- constexpr unsigned int num_elems_read_per_iteration = 1;
- const unsigned int num_rows_read_per_iteration = num_blocks_per_descriptor_y;
-
- update_window_and_padding(win, AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration));
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-void CLHOGDetectorKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
-
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLHarrisCornersKernel.cpp b/src/core/CL/kernels/CLHarrisCornersKernel.cpp
deleted file mode 100644
index 313d95fb03..0000000000
--- a/src/core/CL/kernels/CLHarrisCornersKernel.cpp
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <sstream>
-#include <string>
-
-using namespace arm_compute;
-
-CLHarrisScoreKernel::CLHarrisScoreKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(), _strength_thresh(), _norm_factor(), _border_size(0)
-{
-}
-
-BorderSize CLHarrisScoreKernel::border_size() const
-{
- return _border_size;
-}
-
-void CLHarrisScoreKernel::configure(const ICLImage *input1, const ICLImage *input2, ICLImage *output,
- int32_t block_size, float norm_factor, float strength_thresh, float sensitivity,
- bool border_undefined)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, block_size, norm_factor, strength_thresh, sensitivity, border_undefined);
-}
-
-void CLHarrisScoreKernel::configure(const CLCompileContext &compile_context, const ICLImage *input1, const ICLImage *input2, ICLImage *output,
- int32_t block_size, float norm_factor, float strength_thresh, float sensitivity,
- bool border_undefined)
-{
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
- ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7));
- ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
-
- _input1 = input1;
- _input2 = input2;
- _output = output;
- _sensitivity = sensitivity;
- _strength_thresh = strength_thresh;
- _norm_factor = norm_factor;
- _border_size = BorderSize(block_size / 2);
-
- // Select kernel
- std::stringstream harris_score_kernel_name;
- harris_score_kernel_name << "harris_score_" << block_size << "x" << block_size;
-
- // Create build options
- std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())) };
-
- // Create kernel
- _kernel = create_kernel(compile_context, harris_score_kernel_name.str(), build_opts);
-
- // Set static kernel arguments
- unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
- _kernel.setArg(idx++, sensitivity);
- _kernel.setArg(idx++, strength_thresh);
- _kernel.setArg(idx++, norm_factor);
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 4;
- constexpr unsigned int num_elems_written_per_iteration = 4;
- const unsigned int num_elems_read_per_iteration = block_size == 7 ? 10 : 8;
- const unsigned int num_rows_read_per_iteration = block_size;
-
- Window win = calculate_max_window(*_input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
- AccessWindowRectangle input1_access(input1->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
- AccessWindowRectangle input2_access(input2->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
- update_window_and_padding(win, input1_access, input2_access, output_access);
-
- ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), input2->info()->valid_region());
- output_access.set_valid_region(win, valid_region, border_undefined, border_size());
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = harris_score_kernel_name.str();
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input1->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input1->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input1->info()->dimension(1));
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input2->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input2->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input2->info()->dimension(1));
-}
-
-void CLHarrisScoreKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input1, slice);
- add_2D_tensor_argument(idx, _input2, slice);
- add_2D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp
deleted file mode 100644
index 5c0eb2a606..0000000000
--- a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
-#include "support/StringSupport.h"
-
-#include <map>
-
-namespace arm_compute
-{
-namespace
-{
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int height_offset, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration)
-{
- num_elems_processed_per_iteration = 4;
- // The window needs to be based on input as we copy all the heights of input
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, height_offset, num_elems_processed_per_iteration);
- bool window_changed = update_window_and_padding(win, input_access, output_access);
-
- Window win_collapsed = win.collapse(win, Window::DimZ);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win_collapsed);
-}
-Status validate_arguments(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) + height_offset > output->dimension(Window::DimY));
-
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != output->dimension(0));
- for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
- }
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
-
- return Status{};
-}
-} // namespace
-
-CLHeightConcatenateLayerKernel::CLHeightConcatenateLayerKernel()
- : _input(nullptr), _output(nullptr), _height_offset(0), _num_elems_processed_per_iteration()
-{
-}
-
-Status CLHeightConcatenateLayerKernel::validate(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output)
-{
- unsigned int num_elems_processed_per_iteration;
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, height_offset, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), height_offset, output->clone().get(), num_elems_processed_per_iteration).first);
- return Status{};
-}
-
-void CLHeightConcatenateLayerKernel::configure(const ICLTensor *input, unsigned int height_offset, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, height_offset, output);
-}
-
-void CLHeightConcatenateLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int height_offset, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), height_offset, output->info()));
-
- _input = input;
- _output = output;
- _height_offset = height_offset;
-
- auto win_config = validate_and_configure_window(input->info(), height_offset, output->info(), _num_elems_processed_per_iteration);
-
- // Add build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration));
- build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset));
- build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
-
- if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info())
- {
- const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
-
- build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
- build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
- build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale));
- build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, "concatenate_height", build_opts.options());
- // Configure kernel window
-
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
- ICLKernel::configure_internal(std::get<1>(win_config));
-
- // Set output valid region
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-}
-
-void CLHeightConcatenateLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, window);
- add_4D_tensor_argument(idx, _output, window);
- enqueue(queue, *this, window, lws_hint());
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLHistogramKernel.cpp b/src/core/CL/kernels/CLHistogramKernel.cpp
deleted file mode 100644
index f16fa8c9fb..0000000000
--- a/src/core/CL/kernels/CLHistogramKernel.cpp
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLHistogramKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLDistribution1D.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/StringSupport.h"
-
-#include <cstring>
-#include <string>
-
-using namespace arm_compute;
-
-// each thread handle 16 pixels
-constexpr signed int pixels_per_item = 16;
-
-// local work group size in X dimension
-constexpr unsigned int local_x_size = 16;
-
-CLHistogramKernel::CLHistogramKernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLHistogramKernel::configure(const ICLImage *input, ICLDistribution1D *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLHistogramKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output)
-{
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
- ARM_COMPUTE_ERROR_ON(nullptr == output);
-
- // Check input size
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-
- // Check offset
- ARM_COMPUTE_ERROR_ON_MSG(0 > output->offset() || output->offset() > 256, "Offset is larger than the image value range.");
-
- // Check range
- ARM_COMPUTE_ERROR_ON_MSG(output->range() > 256 /* max range */, "Range larger than the image value range.");
-
- _input = input;
- _output = output;
-
- if(_input->info()->dimension(0) < pixels_per_item)
- {
- return;
- }
-
- unsigned int num_bins = _output->num_bins();
- unsigned int window_size = _output->window();
- unsigned int offset = _output->offset();
- unsigned int range = _output->range();
- unsigned int offrange = offset + range;
- unsigned int bin_size = _output->size();
- unsigned int buffer_size = bin_size + 1; // We need one extra place for pixels that don't meet the conditions
-
- // Create kernel
- bool is_fixed_size = (256 == num_bins) && (1 == window_size) && (0 == offset) && (256 == offrange);
- const std::string kernel_name = is_fixed_size ? "hist_local_kernel_fixed" : "hist_local_kernel";
- _kernel = create_kernel(compile_context, kernel_name);
-
- // Set static kernel arguments
- unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters
- _kernel.setArg(idx++, buffer_size, nullptr);
- _kernel.setArg(idx++, _output->cl_buffer());
- if(!is_fixed_size)
- {
- _kernel.setArg<cl_uint>(idx++, num_bins);
- _kernel.setArg<cl_uint>(idx++, offset);
- _kernel.setArg<cl_uint>(idx++, range);
- _kernel.setArg<cl_uint>(idx++, offrange);
- }
-
- // We only run histogram on Image, therefore only 2 dimensions here
- unsigned int end_position = (_input->info()->dimension(0) / pixels_per_item) * pixels_per_item;
-
- // Configure kernel window
- Window win;
- win.set(0, Window::Dimension(0, end_position, pixels_per_item));
- win.set(1, Window::Dimension(0, _input->info()->dimension(1)));
-
- update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, pixels_per_item));
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-void CLHistogramKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- // TODO (COMPMID-679): Add CLMemFill
- _output->map(queue, true);
- ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
- memset(_output->buffer(), 0, _output->size());
- _output->unmap(queue);
-
- if(_input->info()->dimension(0) < pixels_per_item)
- {
- return;
- }
-
- Window slice = window.first_slice_window_2D();
- const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
- cl::NDRange lws = (local_x_size < gws_x) ? cl::NDRange(local_x_size, 1) : cl::NDRange(1, 1);
-
- do
- {
- /* Run the core part which has width can be divided by 16 */
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
-
- enqueue(queue, *this, slice, lws);
- }
- while(window.slide_window_slice_2D(slice));
-}
-
-CLHistogramBorderKernel::CLHistogramBorderKernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLHistogramBorderKernel::configure(const ICLImage *input, ICLDistribution1D *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLHistogramBorderKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output)
-{
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
- ARM_COMPUTE_ERROR_ON(nullptr == output);
-
- // Check input size
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-
- // Check offset
- ARM_COMPUTE_ERROR_ON_MSG(0 > output->offset() || output->offset() > 256, "Offset is larger than the image value range.");
-
- // Check range
- ARM_COMPUTE_ERROR_ON_MSG(output->range() > 256 /* max range */, "Range larger than the image value range.");
-
- // We only run histogram on Image, therefore only 2 dimensions here
- unsigned int start_position = (input->info()->dimension(0) / pixels_per_item) * pixels_per_item;
-
- if(start_position >= input->info()->dimension(0))
- {
- return; // no need to run histogram border kernel
- }
-
- _input = input;
- _output = output;
-
- unsigned int num_bins = _output->num_bins();
- unsigned int window_size = _output->window();
- unsigned int offset = _output->offset();
- unsigned int range = _output->range();
- unsigned int offrange = offset + range;
-
- // Create kernel
- bool is_fixed_size = (256 == num_bins) && (1 == window_size) && (0 == offset) && (256 == offrange);
- const std::string kernel_name = is_fixed_size ? "hist_border_kernel_fixed" : "hist_border_kernel";
- _kernel = create_kernel(compile_context, kernel_name);
-
- // Set static kernel arguments
- unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters
- _kernel.setArg(idx++, _output->cl_buffer());
- if(!is_fixed_size)
- {
- _kernel.setArg<cl_uint>(idx++, num_bins);
- _kernel.setArg<cl_uint>(idx++, offset);
- _kernel.setArg<cl_uint>(idx++, range);
- _kernel.setArg<cl_uint>(idx++, offrange);
- }
-
- // Configure kernel window
- Window win;
- win.set(0, Window::Dimension(start_position, _input->info()->dimension(0)));
- win.set(1, Window::Dimension(0, _input->info()->dimension(1)));
- update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, 1));
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-void CLHistogramBorderKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- if(window.x().start() >= window.x().end())
- {
- return;
- }
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- cl::NDRange lws = cl::NDRange(1, 1);
-
- Window slice = window.first_slice_window_2D();
-
- do
- {
- /* Run the border part which has width cannot be divided by 16 */
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
-
- enqueue(queue, *this, slice, lws);
- }
- while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
deleted file mode 100644
index 078aad2356..0000000000
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ /dev/null
@@ -1,426 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/StringSupport.h"
-
-#include <cmath>
-#include <tuple>
-#include <utility>
-
-namespace arm_compute
-{
-using namespace misc::shape_calculator;
-
-namespace
-{
-struct Im2ColConfiguration
-{
- std::string kernel_name{};
- std::set<std::string> build_options{};
- unsigned int num_elems_processed_per_iteration{};
- bool is_padding_required_nchw{};
-};
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
- unsigned int num_groups)
-{
- const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
-
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(input->data_type()) && has_bias);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON(num_groups == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::NHWC && num_groups > 1);
- ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(channel_idx) % num_groups) != 0);
-
- if(output->total_size() > 0)
- {
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
- unsigned int num_elems_processed_per_iteration, bool is_padding_required_nchw, unsigned int num_groups)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output tensor auto initialization if not yet initialized
- TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups);
-
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(expected_output_shape));
-
- const DataLayout data_layout = input->data_layout();
- const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int input_width = input->dimension(width_idx);
- const unsigned int input_height = input->dimension(height_idx);
-
- // Configure the execute window based on the selected optimal OpenCL kernel
- bool window_changed = false;
- Window win;
-
- if(data_layout == DataLayout::NHWC)
- {
- win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
- const int xin_start = 0;
- const int xin_end = input->dimension(0) < num_elems_processed_per_iteration ? ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration) : input->dimension(0);
- const int yin_start = 0;
- const int yin_end = input->dimension(1);
-
- const int xout_start = 0;
- const int xout_end = input->dimension(0) < num_elems_processed_per_iteration ? output->dimension(0) + (num_elems_processed_per_iteration - input->dimension(0)) : output->dimension(0);
- const int yout_start = 0;
- const int yout_end = output->dimension(1);
-
- AccessWindowStatic input_access(input, xin_start, yin_start, xin_end, yin_end);
- AccessWindowStatic output_access(output, xout_start, yout_start, xout_end, yout_end);
- window_changed = window_changed || update_window_and_padding(win, input_access, output_access);
- }
- else
- {
- if(is_padding_required_nchw)
- {
- const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
- win = calculate_max_window(*input,
- Steps(num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second));
- AccessWindowStatic input_access(input,
- -border.left,
- -border.top,
- ceil_to_multiple(input_width + border.right, kernel_dims.width * num_elems_processed_per_iteration),
- input_height + border.bottom);
- window_changed = window_changed || update_window_and_padding(win, input_access);
- }
- else
- {
- // For the generic case, CLIm2ColKernel doesn't need padding (we do not read out-of-bounds elements) so
- // update_window_and_padding() can be skipped
- win = calculate_max_window(*input, Steps());
- }
- }
-
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
- // set the Z dimension's step same size as the whole dimension so that one can't split across the Z dimension
- win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start());
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
-Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *input, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups)
-{
- const DataLayout data_layout = input->data_layout();
- const DataType data_type = input->data_type();
- const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const unsigned int input_width = input->dimension(width_idx);
- const unsigned int input_height = input->dimension(height_idx);
- const unsigned int input_channel = input->dimension(channel_idx);
-
- const std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
-
- // Im2Col configuration
- std::string kernel_name = "im2col_generic_";
- CLBuildOptions build_opts;
- unsigned int num_elems_processed_per_iteration = 1;
- bool is_padding_required_nchw = false;
- const UniformQuantizationInfo qinfo = input->quantization_info().uniform();
-
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
- build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->element_size()));
- build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
- build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
- build_opts.add_option("-DCONVOLVED_WIDTH=" + support::cpp11::to_string(convolved_dims.first));
- build_opts.add_option("-DCONVOLVED_HEIGHT=" + support::cpp11::to_string(convolved_dims.second));
- build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
- build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
- build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
- build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
- build_opts.add_option("-DPAD_RIGHT=" + support::cpp11::to_string(conv_info.pad_right()));
- build_opts.add_option("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom()));
- build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width));
- build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height));
- build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_channel));
- build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
- build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
- build_opts.add_option_if(num_groups > 1, "-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
- build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(qinfo.offset), "-DPAD_VALUE=0");
- build_opts.add_option_if(has_bias, "-DHAS_BIAS");
-
- if(data_layout == DataLayout::NHWC)
- {
- num_elems_processed_per_iteration = 2;
- is_padding_required_nchw = false;
-
- // Only the 3x3 and 9x9 cases are optimized for NHWC
- if(kernel_dims == Size2D(3U, 3U))
- {
- kernel_name = "im2col3x3_";
- }
- else if(kernel_dims == Size2D(9U, 9U))
- {
- kernel_name = "im2col9x9_";
- }
-
- build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DLAST_ACCESSED=" + support::cpp11::to_string(std::max(static_cast<int>(input_channel - num_elems_processed_per_iteration), 0)));
- }
- else
- {
- if(dilation == Size2D(1U, 1U))
- {
- const bool squared_im2col = kernel_dims.width == kernel_dims.height;
- if(squared_im2col)
- {
- // Check if we can run an optimized im2col for NCHW
- switch(kernel_dims.width)
- {
- case 1:
- // Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false
- if(conv_info.stride().first == 1 && !conv_info.has_padding())
- {
- kernel_name = "im2col1x1_stridex1_";
- num_elems_processed_per_iteration = 4;
- is_padding_required_nchw = true;
- }
- break;
- case 3:
- kernel_name = "im2col3x3_";
- num_elems_processed_per_iteration = 1;
- is_padding_required_nchw = true;
- break;
- case 5:
- kernel_name = "im2col5x5_";
- num_elems_processed_per_iteration = 1;
- is_padding_required_nchw = true;
- break;
- case 11:
- // Optimized im2col11x11 if pad_x = pad_y = 0
- if(!conv_info.has_padding())
- {
- kernel_name = "im2col11x11_padx0_pady0_";
- num_elems_processed_per_iteration = 1;
- is_padding_required_nchw = true;
- }
- break;
- default:
- kernel_name = "im2col_generic_";
- num_elems_processed_per_iteration = 1;
- is_padding_required_nchw = false;
- break;
- }
- }
- else if(kernel_dims.width > 1 && !conv_info.has_padding())
- {
- kernel_name = "im2col_generic_padx0_pady0_";
- num_elems_processed_per_iteration = 1;
- is_padding_required_nchw = false;
-
- // Optimized im2col is performed using one or more vector operations with the specified vector size
- // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4
- // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3.
- // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3.
- // Using the vector size of 8, however, may be faster.
- // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0
- // is used instead.)
- const size_t vector_size = std::min(static_cast<size_t>(4), kernel_dims.width);
- const size_t width_mod_vector_size = kernel_dims.width % vector_size;
- build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
- build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size));
- }
- }
- }
-
- // Append the data layout to the kernel_name
- kernel_name += lower_string(string_from_data_layout(data_layout));
-
- Im2ColConfiguration im2col_config;
- im2col_config.kernel_name = kernel_name;
- im2col_config.build_options = build_opts.options();
- im2col_config.num_elems_processed_per_iteration = num_elems_processed_per_iteration;
- im2col_config.is_padding_required_nchw = is_padding_required_nchw;
-
- return im2col_config;
-}
-} // namespace
-
-CLIm2ColKernel::CLIm2ColKernel()
- : _input(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups()
-{
-}
-
-void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
- unsigned int num_groups)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, kernel_dims, conv_info, has_bias, dilation, num_groups);
-}
-
-void CLIm2ColKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias,
- const Size2D &dilation,
- unsigned int num_groups)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups));
-
- _data_layout = input->info()->data_layout();
-
- const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int input_width = input->info()->dimension(width_idx);
- const unsigned int input_height = input->info()->dimension(height_idx);
-
- // Select and configure the optimal OpenCL kernel to run.
- // This function returns the OpenCL kernel's name, the arguments to pass at compile time, the number of elements processed per iteration
- // and the padding requirement flag
- Im2ColConfiguration im2col_config = configure_opencl_kernel(input->info(), kernel_dims, conv_info, has_bias, dilation, num_groups);
-
- // Create kernel
- _kernel = create_kernel(compile_context, im2col_config.kernel_name, im2col_config.build_options);
-
- _input = input;
- _output = output;
- _convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
- _num_elems_processed_per_iteration = im2col_config.num_elems_processed_per_iteration;
- _kernel_dims = kernel_dims; // Only needed by the Tuner
- _conv_info = conv_info; // Only needed by the Tuner
- _num_groups = num_groups;
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration,
- im2col_config.is_padding_required_nchw, num_groups);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Set config_id for enabling LWS tuning
- _config_id = im2col_config.kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(num_groups);
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += lower_string(string_from_data_layout(_data_layout));
-}
-
-Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
- unsigned int num_groups)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups));
- Im2ColConfiguration im2col_config = configure_opencl_kernel(input, kernel_dims, conv_info, has_bias, dilation, num_groups);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration,
- im2col_config.is_padding_required_nchw, num_groups)
- .first);
- return Status{};
-}
-
-void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- // Get initial windows
- // Collapse in order to have (SRC_DEPTH * BATCH_SIZE) on the 3rd dimension
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- window_collapsed.set_dimension_step(Window::DimZ, 1);
-
- Window window_output;
- window_output.use_tensor_dimensions(_output->info()->tensor_shape());
-
- const Window first_slice_3d = window_collapsed.first_slice_window_3D();
-
- Window slice = first_slice_3d;
- Window slice_in = first_slice_3d;
- Window slice_out = window_output.first_slice_window_2D();
-
- if(_data_layout == DataLayout::NHWC)
- {
- const Window tmp_win = window.collapse_if_possible(ICLKernel::window(), 3);
- const int num_batches = tmp_win[3].end();
-
- slice.set(1, Window::Dimension(0, static_cast<int>(_output->info()->tensor_shape()[1]), 1));
- slice.set(2, Window::Dimension(0, static_cast<int>(num_batches), 1));
- }
- else
- {
- slice.set(0, Window::Dimension(0, static_cast<int>(ceil_to_multiple(_convolved_dims.first, _num_elems_processed_per_iteration)), _num_elems_processed_per_iteration));
- slice.set(1, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1));
- // Note: In case of NCHW the 3rd dimension is already set collapsing the input window
- }
-
- // Setup input slice
- // The dimensions of the input are increased within the OpenCL kernel
- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- // Setup output slice
- // The dimensions of the output are increased within the OpenCL kernel
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- unsigned int idx = num_arguments_per_3D_tensor() + (_num_groups == 1 ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor());
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[((_num_groups == 1) ? 2 : 3)]));
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- if(_num_groups == 1)
- {
- add_2D_tensor_argument(idx, _output, slice_out);
- }
- else
- {
- add_3D_tensor_argument(idx, _output, slice_out);
- }
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
index 0eb2c50e6f..b13eb16556 100644
--- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,71 +21,162 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h"
+#include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const InstanceNormalizationLayerKernelInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.epsilon == 0.f, "Epsilon must be different than 0");
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+ "Input and output have different number of channels");
}
return Status{};
}
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+Status validate_arguments_meanvar(const ITensorInfo *input, const ITensorInfo *output)
{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
+
+ if (output != nullptr && output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+ "Input and output have different number of channels");
+ }
+
+ return Status{};
+}
+} // namespace
+
+CLComputeMeanVariance::CLComputeMeanVariance() : _input(nullptr), _output(nullptr)
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void CLComputeMeanVariance::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ bool use_mixed_precision)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ auto padding_info = get_padding_info({input, output});
+
+ _input = input;
+ _output = output == nullptr ? input : output;
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_meanvar(_input->info(), _output->info()));
+ const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DINTERNAL_DATA_TYPE=" +
+ (use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.add_option("-DDIM_Y=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.add_option("-DDIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.add_option_if(_input->info()->data_layout() == DataLayout::NHWC, "-DNHWC");
+ // Create kernel
+ _kernel = create_kernel(compile_context, "compute_mean_var", build_opts.options());
+
// We handle the planes manually
- Window win = calculate_max_window(*input, Steps(1));
+ Window win = calculate_max_window(*(input->info()), Steps(1));
+ const auto data_layout = input->info()->data_layout();
+ const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const unsigned int batches_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+ const unsigned int input_channel = input->info()->dimension(channel_idx);
+ const unsigned int input_batches = input->info()->dimension(batches_idx);
+ const TensorShape out_shape(input_channel, 2u, input_batches);
// Output auto initialization if not yet initialized
- auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type());
+ if (use_mixed_precision)
+ {
+ auto_init_if_empty(*_output->info(), out_shape, 1, DataType::F32);
+ }
+ else
+ {
+ auto_init_if_empty(*_output->info(), out_shape, 1, input->info()->data_type());
+ }
+ ICLKernel::configure_internal(win);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
- // CLInstanceNormalizationLayerKernel doesn't need padding so update_window_and_padding() can be skipped
- Coordinates coord;
- coord.set_num_dimensions(output->num_dimensions());
- output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
- return std::make_pair(Status{}, win);
+Status CLComputeMeanVariance::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_meanvar(input, output));
+ return Status{};
}
-} // namespace
-CLInstanceNormalizationLayerKernel::CLInstanceNormalizationLayerKernel()
- : _input(nullptr), _output(nullptr), _run_in_place(false)
+void CLComputeMeanVariance::run(const Window &window, cl::CommandQueue &queue)
{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window collapsed_window = window.collapse(window, Window::DimZ);
+
+ // We will process the planes together
+ if (_input->info()->data_layout() == DataLayout::NCHW)
+ {
+ collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
+ collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1));
+ }
+ else
+ {
+ collapsed_window.set(Window::DimZ, Window::Dimension(0, 1, 1));
+ collapsed_window.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(3), 1));
+ }
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, collapsed_window);
+ add_3D_tensor_argument(idx, _output, collapsed_window);
+
+ enqueue(queue, *this, collapsed_window, lws_hint());
}
-void CLInstanceNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info)
+CLInstanceNormalizationLayerKernel::CLInstanceNormalizationLayerKernel()
+ : _input(nullptr), _output(nullptr), _mean(nullptr), _run_in_place(false)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info)
+void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *mean_var,
+ ICLTensor *output,
+ const InstanceNormalizationLayerKernelInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output == nullptr ? input : output;
+ _mean = mean_var;
_run_in_place = (output == nullptr) || (output == input);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(), info));
@@ -93,7 +184,9 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (info.use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (info.use_mixed_precision
+ ? "float"
+ : get_cl_type_from_data_type(input->info()->data_type())));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0)));
build_opts.add_option("-DDIM_Y=" + support::cpp11::to_string(input->info()->dimension(1)));
@@ -108,15 +201,21 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi
_kernel = create_kernel(compile_context, "instance_normalization", build_opts.options());
// Configure kernel window
- auto win_config = validate_and_configure_window(_input->info(), _output->info());
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
- ICLKernel::configure_internal(std::get<1>(win_config));
+ Window win = calculate_max_window(*input->info(), Steps(1));
+ if (output != nullptr)
+ {
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type());
+ }
+
+ ICLKernel::configure_internal(win);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info)
+Status CLInstanceNormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const InstanceNormalizationLayerKernelInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
return Status{};
}
@@ -128,7 +227,7 @@ void CLInstanceNormalizationLayerKernel::run(const Window &window, cl::CommandQu
Window collapsed_window = window.collapse(window, Window::DimZ);
// We will process the planes together
- if(_input->info()->data_layout() == DataLayout::NCHW)
+ if (_input->info()->data_layout() == DataLayout::NCHW)
{
collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1));
@@ -141,7 +240,9 @@ void CLInstanceNormalizationLayerKernel::run(const Window &window, cl::CommandQu
unsigned int idx = 0;
add_4D_tensor_argument(idx, _input, collapsed_window);
- if(!_run_in_place)
+ add_3D_tensor_argument(idx, _mean, collapsed_window);
+
+ if (!_run_in_place)
{
add_4D_tensor_argument(idx, _output, collapsed_window);
}
diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
new file mode 100644
index 0000000000..9f436da7f6
--- /dev/null
+++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H
+#define ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ICLTensor;
+
+/** Interface for the kernel that performs an instance normalization using precomputed mean and variance values */
+class CLInstanceNormalizationLayerKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLInstanceNormalizationLayerKernel();
+ /** Copy construction is disabled (this class holds non-owning tensor pointers) */
+ CLInstanceNormalizationLayerKernel(const CLInstanceNormalizationLayerKernel &) = delete;
+ /** Copy assignment is disabled (this class holds non-owning tensor pointers) */
+ CLInstanceNormalizationLayerKernel &operator=(const CLInstanceNormalizationLayerKernel &) = delete;
+ /** Default move constructor */
+ CLInstanceNormalizationLayerKernel(CLInstanceNormalizationLayerKernel &&) = default;
+ /** Default move assignment operator */
+ CLInstanceNormalizationLayerKernel &operator=(CLInstanceNormalizationLayerKernel &&) = default;
+ /** Default destructor */
+ ~CLInstanceNormalizationLayerKernel() = default;
+
+ /** Set the input, mean/variance and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in, out] input Source tensor. Data types supported: F16/F32. Data layout supported: NCHW, NHWC
+ * In case of @p output tensor = nullptr this tensor will store the result of the normalization.
+ * @param[in] mean_var Tensor containing the precomputed mean and variance values. Data types supported: F32.
+ * @param[out] output Destination tensor (may be nullptr, see @p input). Data types and data layouts supported: same as @p input.
+ * @param[in] info Kernel meta-data descriptor
+ */
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *mean_var,
+ ICLTensor *output,
+ const InstanceNormalizationLayerKernelInfo &info);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayerKernel.
+ *
+ * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported: NHWC, NCHW
+ * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p input.
+ * @param[in] info Kernel meta-data descriptor
+ *
+ * @return a status
+ */
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_input; /**< Source tensor; bound by run() as a 4D tensor argument */
+ ICLTensor *_output; /**< Destination tensor; bound by run() only when not running in place */
+ ICLTensor *_mean; /**< Bound by run() as a 3D tensor argument; presumably the mean_var tensor given to configure() - TODO confirm */
+ bool _run_in_place; /**< When true, run() does not bind a separate output tensor argument */
+};
+
+/** Interface for the kernel that computes the mean and variance per channel */
+class CLComputeMeanVariance : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLComputeMeanVariance();
+ /** Copy construction is disabled (this class holds non-owning tensor pointers) */
+ CLComputeMeanVariance(const CLComputeMeanVariance &) = delete;
+ /** Copy assignment is disabled (this class holds non-owning tensor pointers) */
+ CLComputeMeanVariance &operator=(const CLComputeMeanVariance &) = delete;
+ /** Default move constructor */
+ CLComputeMeanVariance(CLComputeMeanVariance &&) = default;
+ /** Default move assignment operator */
+ CLComputeMeanVariance &operator=(CLComputeMeanVariance &&) = default;
+ /** Default destructor */
+ ~CLComputeMeanVariance() = default;
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in, out] input Source tensor. Data types supported: F16/F32. Data layout supported: NCHW, NHWC
+ * In case of @p output tensor = nullptr this tensor will store the result (NOTE(review): wording mirrors the normalization kernel - confirm it applies here).
+ * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input (NOTE(review): the normalization kernel documents its mean/variance input as F32 - confirm).
+ * @param[in] use_mixed_precision Use mixed precision in case of FP16 execution
+ */
+ void
+ configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref CLComputeMeanVariance.
+ *
+ * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported: NHWC, NCHW
+ * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p input.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_input; /**< Source tensor; presumably the one passed to configure() - TODO confirm */
+ ICLTensor *_output; /**< Destination tensor; presumably the one passed to configure() - TODO confirm */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLIntegralImageKernel.cpp b/src/core/CL/kernels/CLIntegralImageKernel.cpp
deleted file mode 100644
index 4c3445d1ae..0000000000
--- a/src/core/CL/kernels/CLIntegralImageKernel.cpp
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-
-using namespace arm_compute;
-
-void CLIntegralImageHorKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLIntegralImageHorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32);
-
- _input = input;
- _output = output;
-
- // Create kernel
- const std::string kernel_name = std::string("integral_horizontal");
- _kernel = create_kernel(compile_context, kernel_name);
-
- // Configure kernel window
- const unsigned int num_elems_processed_per_iteration = input->info()->dimension(0);
- const unsigned int num_elems_accessed_per_iteration = ceil_to_multiple(num_elems_processed_per_iteration, 16);
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_accessed_per_iteration);
-
- update_window_and_padding(win,
- AccessWindowHorizontal(input->info(), 0, num_elems_accessed_per_iteration),
- output_access);
-
- output_access.set_valid_region(win, input->info()->valid_region());
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-CLIntegralImageVertKernel::CLIntegralImageVertKernel()
- : _in_out(nullptr)
-{
-}
-
-void CLIntegralImageVertKernel::configure(ICLTensor *in_out)
-{
- configure(CLKernelLibrary::get().get_compile_context(), in_out);
-}
-
-void CLIntegralImageVertKernel::configure(const CLCompileContext &compile_context, ICLTensor *in_out)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(in_out, 1, DataType::U32);
-
- _in_out = in_out;
-
- // Create kernel
- const std::string kernel_name = std::string("integral_vertical");
- _kernel = create_kernel(compile_context, kernel_name);
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration_x = 8;
- const unsigned int num_elems_processed_per_iteration_y = in_out->info()->dimension(Window::DimY);
-
- Window win = calculate_max_window(*in_out->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowRectangle in_out_access(in_out->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-
- update_window_and_padding(win, in_out_access);
-
- in_out_access.set_valid_region(win, in_out->info()->valid_region());
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(in_out->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(in_out->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(in_out->info()->dimension(1));
-}
-
-void CLIntegralImageVertKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const size_t height = _in_out->info()->dimension(1);
-
- Window slice = window.first_slice_window_2D();
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _in_out, slice);
- _kernel.setArg<cl_uint>(idx++, height);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
index e04950d0a2..9ed9d7c5b0 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,18 +21,20 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h"
+#include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
@@ -41,9 +43,8 @@ namespace
{
constexpr int max_input_tensor_dim = 3;
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
{
ARM_COMPUTE_UNUSED(epsilon);
@@ -53,14 +54,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis > 2, "Actual axis greater than 2 is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, "Actual normalization axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions,
+ "Actual normalization axis greater than max number of dimensions");
// Reduce shape on axis
TensorShape sum_shape = input->tensor_shape();
sum_shape.set(actual_axis, 1);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -70,40 +72,30 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons
return Status{};
}
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type());
-
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, input->valid_region());
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-
- return std::make_tuple(err, win);
-}
} // namespace
CLL2NormalizeLayerKernel::CLL2NormalizeLayerKernel()
: _input(nullptr), _sum(nullptr), _output(nullptr), _actual_axis(0), _epsilon(1e-12)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLL2NormalizeLayerKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon)
+void CLL2NormalizeLayerKernel::configure(
+ const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon)
{
configure(CLKernelLibrary::get().get_compile_context(), input, sum, output, axis, epsilon);
}
-void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon)
+void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *sum,
+ ICLTensor *output,
+ int axis,
+ float epsilon)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon));
+ auto padding_info = get_padding_info({input, sum, output});
_input = input;
_sum = sum;
@@ -111,35 +103,40 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context
_actual_axis = wrap_around(axis, max_input_tensor_dim);
_epsilon = epsilon;
+ const unsigned int vec_size_x =
+ adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
+ const int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
+
// Set build options
- std::set<std::string> build_opts;
- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DVEC_SIZE_X=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER_X=" + support::cpp11::to_string(vec_size_x_leftovers));
// Create kernel
std::string kernel_name;
unsigned int idx = 0;
- switch(_actual_axis)
+ switch (_actual_axis)
{
case 0:
- kernel_name = "x";
+ kernel_name = "l2_normalize_x";
idx = num_arguments_per_2D_tensor() * 3;
break;
case 1:
- kernel_name = "y";
+ kernel_name = "l2_normalize_y";
idx = num_arguments_per_2D_tensor() * 3;
break;
case 2:
- kernel_name = "z";
+ kernel_name = "l2_normalize_z";
idx = num_arguments_per_3D_tensor() * 3;
break;
default:
ARM_COMPUTE_ERROR("Axis not supported");
}
- _kernel = create_kernel(compile_context, "l2_normalize_" + kernel_name, build_opts);
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Set epsilon argument
- if(input->info()->data_type() == DataType::F32)
+ if (input->info()->data_type() == DataType::F32)
{
_kernel.setArg<cl_float>(idx, _epsilon);
}
@@ -149,17 +146,19 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context
}
// Configure kernel window
- auto win_config = validate_and_configure_window(_input->info(), _output->info());
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+ Window win = calculate_max_window(*input->info(), Steps(vec_size_x));
- ICLKernel::configure_internal(std::get<1>(win_config));
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type());
+
+ ICLKernel::configure_internal(win);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
+Status CLL2NormalizeLayerKernel::validate(
+ const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
-
return Status{};
}
@@ -170,7 +169,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
Window window_sum(window);
- switch(_actual_axis)
+ switch (_actual_axis)
{
case 0:
{
@@ -184,8 +183,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
add_2D_tensor_argument(idx, _sum, sum_slice);
add_2D_tensor_argument(idx, _output, in_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
+ } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
}
break;
case 1:
@@ -200,8 +198,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
add_2D_tensor_argument(idx, _sum, sum_slice);
add_2D_tensor_argument(idx, _output, in_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
+ } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
}
break;
case 2:
@@ -216,8 +213,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
add_3D_tensor_argument(idx, _sum, sum_slice);
add_3D_tensor_argument(idx, _output, in_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice));
+ } while (window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice));
}
break;
default:
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.h b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h
new file mode 100644
index 0000000000..5c9ab94ce5
--- /dev/null
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLL2NORMALIZELAYERKERNEL_H
+#define ARM_COMPUTE_CLL2NORMALIZELAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel that performs an L2 normalization along a given axis, given the square sum along that axis */
+class CLL2NormalizeLayerKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLL2NormalizeLayerKernel();
+ /** Copy construction is disabled (this class holds non-owning tensor pointers) */
+ CLL2NormalizeLayerKernel(const CLL2NormalizeLayerKernel &) = delete;
+ /** Copy assignment is disabled (this class holds non-owning tensor pointers) */
+ CLL2NormalizeLayerKernel &operator=(const CLL2NormalizeLayerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLL2NormalizeLayerKernel(CLL2NormalizeLayerKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLL2NormalizeLayerKernel &operator=(CLL2NormalizeLayerKernel &&) = default;
+ /** Default destructor */
+ ~CLL2NormalizeLayerKernel() = default;
+
+ /** Set the input and output tensors, using the compile context obtained from CLKernelLibrary::get().
+ *
+ * @param[in] input Source tensor. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
+ * @param[in] sum Sum values tensor. Data types supported: same as @p input.
+ * Sum will have the same number of dimensions as input.
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2
+ * @param[in] epsilon Lower bound value for the normalization.
+ */
+ void configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon);
+ /** Set the input and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
+ * @param[in] sum Sum values tensor. Data types supported: same as @p input.
+ * Sum will have the same number of dimensions as input.
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2
+ * @param[in] epsilon Lower bound value for the normalization.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *sum,
+ ICLTensor *output,
+ int axis,
+ float epsilon);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref CLL2NormalizeLayerKernel.
+ *
+ * @param[in] input Source tensor info. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
+ * @param[in] sum Sum values tensor info. Data types supported: same as @p input.
+ * Sum will have the same number of dimensions as input.
+ * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2
+ * @param[in] epsilon Lower bound value for the normalization.
+ *
+ * @return a status
+ */
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor set by configure() */
+ const ICLTensor *_sum; /**< Sum values tensor set by configure() */
+ ICLTensor *_output; /**< Destination tensor */
+ unsigned int _actual_axis; /**< Axis after wrap-around; 0, 1 and 2 select the l2_normalize_x/y/z kernels */
+ float _epsilon; /**< Lower bound value for the normalization; the constructor initialises it to 1e-12 */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLL2NORMALIZELAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLLKTrackerKernel.cpp b/src/core/CL/kernels/CLLKTrackerKernel.cpp
deleted file mode 100644
index a2948d38fe..0000000000
--- a/src/core/CL/kernels/CLLKTrackerKernel.cpp
+++ /dev/null
@@ -1,315 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <cmath>
-
-using namespace arm_compute;
-
-void CLLKTrackerInitKernel::configure(const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates,
- ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
- bool use_initial_estimate, size_t level, size_t num_levels, float pyramid_scale)
-{
- configure(CLKernelLibrary::get().get_compile_context(), old_points, new_points_estimates, old_points_internal, new_points_internal, use_initial_estimate, level, num_levels, pyramid_scale);
-}
-
-void CLLKTrackerInitKernel::configure(const CLCompileContext &compile_context, const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates,
- ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
- bool use_initial_estimate, size_t level, size_t num_levels, float pyramid_scale)
-
-{
- ARM_COMPUTE_ERROR_ON(old_points == nullptr);
- ARM_COMPUTE_ERROR_ON(old_points_internal == nullptr);
- ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
-
- const float scale = std::pow(pyramid_scale, level);
-
- // Create kernel
- std::string kernel_name = "init_level";
- if(level == (num_levels - 1))
- {
- kernel_name += (use_initial_estimate) ? std::string("_max_initial_estimate") : std::string("_max");
- }
- _kernel = create_kernel(compile_context, kernel_name);
-
- // Set static kernel arguments
- unsigned int idx = 0;
- if(level == (num_levels - 1))
- {
- _kernel.setArg(idx++, old_points->cl_buffer());
- if(use_initial_estimate)
- {
- _kernel.setArg(idx++, new_points_estimates->cl_buffer());
- }
- }
- _kernel.setArg(idx++, old_points_internal->cl_buffer());
- _kernel.setArg(idx++, new_points_internal->cl_buffer());
- _kernel.setArg<cl_float>(idx++, scale);
-
- // Configure kernel window
- Window window;
- window.set(Window::DimX, Window::Dimension(0, old_points->num_values(), 1));
- window.set(Window::DimY, Window::Dimension(0, 1, 1));
- ICLKernel::configure_internal(window);
-}
-
-void CLLKTrackerInitKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- enqueue(queue, *this, window, lws_hint());
-}
-
-void CLLKTrackerFinalizeKernel::configure(ICLLKInternalKeypointArray *new_points_internal, ICLKeyPointArray *new_points)
-{
- configure(CLKernelLibrary::get().get_compile_context(), new_points_internal, new_points);
-}
-
-void CLLKTrackerFinalizeKernel::configure(const CLCompileContext &compile_context, ICLLKInternalKeypointArray *new_points_internal, ICLKeyPointArray *new_points)
-
-{
- ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
- ARM_COMPUTE_ERROR_ON(new_points == nullptr);
-
- // Create kernel
- _kernel = create_kernel(compile_context, "finalize");
-
- // Set static kernel arguments
- unsigned int idx = 0;
- _kernel.setArg(idx++, new_points_internal->cl_buffer());
- _kernel.setArg(idx++, new_points->cl_buffer());
-
- // Configure kernel window
- Window window;
- window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1));
- window.set(Window::DimY, Window::Dimension(0, 1, 1));
- ICLKernel::configure_internal(window);
-}
-
-void CLLKTrackerFinalizeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- enqueue(queue, *this, window, lws_hint());
-}
-
-CLLKTrackerStage0Kernel::CLLKTrackerStage0Kernel()
- : _old_input(nullptr), _old_scharr_gx(nullptr), _old_scharr_gy(nullptr)
-{
-}
-
-void CLLKTrackerStage0Kernel::configure(const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy,
- ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
- ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
- size_t window_dimension, size_t level)
-{
- configure(CLKernelLibrary::get().get_compile_context(), old_input, old_scharr_gx, old_scharr_gy, old_points_internal, new_points_internal, coeff_table, old_ival, window_dimension, level);
-}
-
-void CLLKTrackerStage0Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy,
- ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
- ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
- size_t window_dimension, size_t level)
-
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_scharr_gx, 1, DataType::S16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_scharr_gy, 1, DataType::S16);
- ARM_COMPUTE_ERROR_ON(old_points_internal == nullptr);
- ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
- ARM_COMPUTE_ERROR_ON(coeff_table == nullptr);
- ARM_COMPUTE_ERROR_ON(old_ival == nullptr);
-
- _old_input = old_input;
- _old_scharr_gx = old_scharr_gx;
- _old_scharr_gy = old_scharr_gy;
-
- // Configure kernel window
- Window window;
- window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1));
- window.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- const ValidRegion valid_region = intersect_valid_regions(
- old_input->info()->valid_region(),
- old_scharr_gx->info()->valid_region(),
- old_scharr_gy->info()->valid_region());
-
- update_window_and_padding(window,
- AccessWindowStatic(old_input->info(), valid_region.start(0), valid_region.start(1),
- valid_region.end(0), valid_region.end(1)),
- AccessWindowStatic(old_scharr_gx->info(), valid_region.start(0), valid_region.start(1),
- valid_region.end(0), valid_region.end(1)),
- AccessWindowStatic(old_scharr_gy->info(), valid_region.start(0), valid_region.start(1),
- valid_region.end(0), valid_region.end(1)));
-
- ICLKernel::configure_internal(window);
-
- // Initialize required variables
- const int level0 = (level == 0) ? 1 : 0;
- const int window_size = window_dimension;
- const int window_size_squared = window_dimension * window_dimension;
- const int window_size_half = window_dimension / 2;
- const float eig_const = 1.0f / (2.0f * window_size_squared);
- const cl_float3 border_limits =
- {
- {
- // -1 because we load 2 values at once for bilinear interpolation
- static_cast<cl_float>(valid_region.end(0) - window_size - 1),
- static_cast<cl_float>(valid_region.end(1) - window_size - 1),
- static_cast<cl_float>(valid_region.start(0))
- }
- };
-
- // Create kernel
- _kernel = create_kernel(compile_context, "lktracker_stage0");
-
- // Set arguments
- unsigned int idx = 3 * num_arguments_per_2D_tensor();
- _kernel.setArg(idx++, old_points_internal->cl_buffer());
- _kernel.setArg(idx++, new_points_internal->cl_buffer());
- _kernel.setArg(idx++, coeff_table->cl_buffer());
- _kernel.setArg(idx++, old_ival->cl_buffer());
- _kernel.setArg<cl_int>(idx++, window_size);
- _kernel.setArg<cl_int>(idx++, window_size_squared);
- _kernel.setArg<cl_int>(idx++, window_size_half);
- _kernel.setArg<cl_float3>(idx++, border_limits);
- _kernel.setArg<cl_float>(idx++, eig_const);
- _kernel.setArg<cl_int>(idx++, level0);
-}
-
-void CLLKTrackerStage0Kernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- // Set static tensor arguments. Setting here as allocation might be deferred.
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _old_input, window);
- add_2D_tensor_argument(idx, _old_scharr_gx, window);
- add_2D_tensor_argument(idx, _old_scharr_gy, window);
-
- enqueue(queue, *this, window, lws_hint());
-}
-
-CLLKTrackerStage1Kernel::CLLKTrackerStage1Kernel()
- : _new_input(nullptr)
-{
-}
-
-void CLLKTrackerStage1Kernel::configure(const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
- Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level)
-{
- configure(CLKernelLibrary::get().get_compile_context(), new_input, new_points_internal, coeff_table, old_ival, termination, epsilon, num_iterations, window_dimension, level);
-}
-
-void CLLKTrackerStage1Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table,
- ICLOldValArray *old_ival,
- Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level)
-
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(new_input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
- ARM_COMPUTE_ERROR_ON(coeff_table == nullptr);
- ARM_COMPUTE_ERROR_ON(old_ival == nullptr);
-
- _new_input = new_input;
-
- // Configure kernel window
- Window window;
- window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1));
- window.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- const ValidRegion &valid_region = new_input->info()->valid_region();
-
- update_window_and_padding(window,
- AccessWindowStatic(new_input->info(), valid_region.start(0), valid_region.start(1),
- valid_region.end(0), valid_region.end(1)));
-
- ICLKernel::configure_internal(window);
-
- // Initialize required variables
- const int level0 = (level == 0) ? 1 : 0;
- const int window_size = window_dimension;
- const int window_size_squared = window_dimension * window_dimension;
- const int window_size_half = window_dimension / 2;
- const float eig_const = 1.0f / (2.0f * window_size_squared);
- const cl_float3 border_limits =
- {
- {
- // -1 because we load 2 values at once for bilinear interpolation
- static_cast<cl_float>(valid_region.end(0) - window_size - 1),
- static_cast<cl_float>(valid_region.end(1) - window_size - 1),
- static_cast<cl_float>(valid_region.start(0))
- }
- };
-
- // Set maximum number of iterations used for convergence
- const size_t max_iterations = 1000;
- num_iterations = (termination == Termination::TERM_CRITERIA_EPSILON) ? max_iterations : num_iterations;
-
- const int term_epsilon = (termination == Termination::TERM_CRITERIA_EPSILON || termination == Termination::TERM_CRITERIA_BOTH) ? 1 : 0;
-
- // Create kernel
- _kernel = create_kernel(compile_context, "lktracker_stage1");
-
- // Set static kernel arguments
- unsigned int idx = num_arguments_per_2D_tensor();
- _kernel.setArg(idx++, new_points_internal->cl_buffer());
- _kernel.setArg(idx++, coeff_table->cl_buffer());
- _kernel.setArg(idx++, old_ival->cl_buffer());
- _kernel.setArg<cl_int>(idx++, window_size);
- _kernel.setArg<cl_int>(idx++, window_size_squared);
- _kernel.setArg<cl_int>(idx++, window_size_half);
- _kernel.setArg<cl_int>(idx++, num_iterations);
- _kernel.setArg<cl_float>(idx++, epsilon);
- _kernel.setArg<cl_float3>(idx++, border_limits);
- _kernel.setArg<cl_float>(idx++, eig_const);
- _kernel.setArg<cl_int>(idx++, level0);
- _kernel.setArg<cl_int>(idx++, term_epsilon);
-}
-
-void CLLKTrackerStage1Kernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- // Set static tensor arguments. Setting here as allocation might be deferred.
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _new_input, window);
-
- enqueue(queue, *this, window, lws_hint());
-}
diff --git a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
deleted file mode 100644
index 04ad754cbf..0000000000
--- a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-
-#include <set>
-#include <sstream>
-#include <string>
-
-using namespace arm_compute;
-
-CLLocallyConnectedMatrixMultiplyKernel::CLLocallyConnectedMatrixMultiplyKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr)
-{
-}
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
-
- return Status{};
-}
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
-{
- const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->data_type());
-
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x));
-
- AccessWindowHorizontal input0_access(input0, 0, num_elems_processed_per_iteration_x);
- AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration_x);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_x);
-
- bool window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-
- return std::make_tuple(err, win);
-}
-} // namespace
-
-void CLLocallyConnectedMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output);
-}
-
-void CLLocallyConnectedMatrixMultiplyKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
-
- cl::NDRange lws_hint;
- if(output->info()->dimension(1) == 196)
- {
- lws_hint = cl::NDRange(1, 7);
- }
- else
- {
- lws_hint = cl::NDRange(8, 8);
- }
-
- std::ostringstream mm_arguments;
- std::set<std::string> build_opts;
-
- mm_arguments << "-DWIDTH_VECTOR_A=" << input0->info()->dimension(0) << " ";
- build_opts.emplace(mm_arguments.str());
-
- // Create kernel
- std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type()));
- _kernel = create_kernel(compile_context, ("gemm_lc_vm_" + data_type_name), build_opts);
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
-
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
- ICLKernel::configure_internal(std::get<1>(win_config), lws_hint);
-}
-
-Status CLLocallyConnectedMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get())));
-
- return Status{};
-}
-
-void CLLocallyConnectedMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
-
- Window matrix_b_window;
- matrix_b_window.use_tensor_dimensions(_input1->info()->tensor_shape());
- Window slice_matrix_b = matrix_b_window.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input0, slice);
- add_3D_tensor_argument(idx, _input1, slice_matrix_b);
- add_2D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp b/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp
deleted file mode 100644
index 88c10342f4..0000000000
--- a/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLMagnitudePhaseKernel::CLMagnitudePhaseKernel()
- : _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr), _run_mag(false), _run_phase(false)
-{
-}
-
-void CLMagnitudePhaseKernel::configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase,
- MagnitudeType mag_type, PhaseType phase_type)
-{
- configure(CLKernelLibrary::get().get_compile_context(), gx, gy, magnitude, phase, mag_type, phase_type);
-}
-
-void CLMagnitudePhaseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase,
- MagnitudeType mag_type, PhaseType phase_type)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
- ARM_COMPUTE_ERROR_ON((magnitude == nullptr) && (phase == nullptr));
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy);
-
- _run_mag = (magnitude != nullptr);
- _run_phase = (phase != nullptr);
- if(_run_mag)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::S16, DataType::S32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, magnitude);
- }
- if(_run_phase)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
- }
-
- if(!_run_mag && !_run_phase)
- {
- ARM_COMPUTE_ERROR("At least one output must be NOT NULL");
- }
-
- _gx = gx;
- _gy = gy;
- _magnitude = magnitude;
- _phase = phase;
-
- // Construct kernel name
- std::set<std::string> build_opts = {};
-
- // Add magnitude type
- if(_run_mag)
- {
- switch(mag_type)
- {
- case MagnitudeType::L1NORM:
- build_opts.insert("-DMAGNITUDE=1");
- break;
- case MagnitudeType::L2NORM:
- build_opts.insert("-DMAGNITUDE=2");
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported magnitude calculation type.");
- build_opts.insert("-DMAGNITUDE=0");
- break;
- }
- }
-
- // Add phase type
- if(_run_phase)
- {
- switch(phase_type)
- {
- case PhaseType::UNSIGNED:
- build_opts.insert("-DPHASE=1");
- break;
- case PhaseType::SIGNED:
- build_opts.insert("-DPHASE=2");
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported phase calculation type.");
- build_opts.insert("-DPHASE=0");
- break;
- }
- }
-
- // Add data_type
- build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(gx->info()->data_type()));
-
- // Create kernel
- const std::string kernel_name = std::string("magnitude_phase");
- _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 16;
-
- Window win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal gx_access(gx->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal gy_access(gy->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_phase_access(phase == nullptr ? nullptr : phase->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win,
- gx_access, gy_access,
- output_magnitude_access, output_phase_access);
-
- ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(),
- gy->info()->valid_region());
- output_magnitude_access.set_valid_region(win, valid_region);
- output_phase_access.set_valid_region(win, valid_region);
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(gx->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(gx->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(gx->info()->dimension(1));
-}
-
-void CLMagnitudePhaseKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _gx, slice);
- add_2D_tensor_argument(idx, _gy, slice);
- add_2D_tensor_argument_if((_run_mag), idx, _magnitude, slice);
- add_2D_tensor_argument_if((_run_phase), idx, _phase, slice);
-
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
new file mode 100644
index 0000000000..e560f1de4a
--- /dev/null
+++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2020-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+using namespace misc::shape_calculator;
+
+namespace
+{
+/** Check that the tensor infos and pooling info describe a supported max-unpooling configuration.
+ *
+ * @param[in] input     Source tensor info. Data types accepted: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] output    Destination tensor info. Its data type and data layout are compared against
+ *                      @p input only when its total size is non-zero.
+ * @param[in] pool_info Pooling info. Only PoolingType::MAX with a 2x2 pool size is accepted.
+ * @param[in] indices   Indices tensor info. Must be U32 and have the same shape as @p input.
+ *
+ * @return a status
+ */
+Status validate_arguments(const ITensorInfo      *input,
+                          const ITensorInfo      *output,
+                          const PoolingLayerInfo &pool_info,
+                          const ITensorInfo      *indices)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, indices);
+
+    // Build the pool size straight from pool_info: going through an intermediate int
+    // could wrap an oversized value onto 2 and let it pass the 2x2 check below.
+    const Size2D pool_size(pool_info.pool_size.width, pool_info.pool_size.height);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX,
+                                    "Pooling indices only supported for MAX pooling method");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
+
+    // An output with zero total size is accepted here without further checks
+    if (output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+    }
+
+    return Status{};
+}
+} // namespace
+
+/** Default constructor: all tensor pointers start as nullptr and the kernel type is set to POOL. */
+CLMaxUnpoolingLayerKernel::CLMaxUnpoolingLayerKernel()
+    : _input(nullptr),
+      _output(nullptr),
+      _indices(nullptr)
+{
+    _type = CLKernelType::POOL;
+}
+
+/** Validate the arguments, build the OpenCL kernel and set up the execution window.
+ *
+ * @param[in]  compile_context Compile context used to create the OpenCL kernel.
+ * @param[in]  input           Source tensor.
+ * @param[in]  indices         Indices tensor; validated to be U32 with the same shape as @p input.
+ * @param[out] output          Destination tensor. If empty, it is auto-initialised from @p input's info
+ *                             with the shape returned by compute_unpool_shape().
+ * @param[in]  pool_info       Pooling info; only MAX pooling with pool size 2x2 passes validation.
+ */
+void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_context,
+                                          const ICLTensor        *input,
+                                          const ICLTensor        *indices,
+                                          ICLTensor              *output,
+                                          const PoolingLayerInfo &pool_info)
+{
+    // indices is dereferenced on the next line, so it must be null-checked with the other tensors
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, indices, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, indices->info()));
+    auto padding_info = get_padding_info({input, indices, output});
+
+    _input   = input;
+    _output  = output;
+    _indices = indices;
+
+    // Initialise the output before anything reads its dimensions: with an empty output the
+    // WIDTH_DST/HEIGHT_DST/DEPTH_DST build options would otherwise be taken from an unset shape.
+    const TensorShape output_shape = compute_unpool_shape(*input->info(), pool_info);
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
+    build_opts.add_option("-DWIDTH_DST=" + support::cpp11::to_string(output->info()->dimension(0)));
+    build_opts.add_option("-DHEIGHT_DST=" + support::cpp11::to_string(output->info()->dimension(1)));
+    build_opts.add_option("-DDEPTH_DST=" + support::cpp11::to_string(output->info()->dimension(2)));
+
+    const std::string kernel_name("max_unpooling_layer_2");
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // The execution window is computed over the input tensor
+    auto window = calculate_max_window(*input->info(), Steps());
+    ICLKernel::configure_internal(window);
+
+    // Set config_id for enabling LWS tuning
+    // NOTE(review): there is no "_" between the kernel name and the data type; left unchanged
+    // because the id string may already be used as a key by existing tuning data - confirm.
+    _config_id = kernel_name;
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(3));
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+/** Static check of whether the given infos would lead to a valid kernel configuration.
+ *
+ * @param[in] input     Source tensor info.
+ * @param[in] indices   Indices tensor info.
+ * @param[in] output    Destination tensor info.
+ * @param[in] pool_info Pooling info.
+ *
+ * @return a status
+ */
+Status CLMaxUnpoolingLayerKernel::validate(const ITensorInfo      *input,
+                                           const ITensorInfo      *indices,
+                                           const ITensorInfo      *output,
+                                           const PoolingLayerInfo &pool_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
+
+    // validate_arguments() takes output before pool_info and indices last
+    return validate_arguments(input, output, pool_info, indices);
+}
+
+/** Enqueue the kernel once for every 3D slice of the given window.
+ *
+ * @param[in]     window Region to execute; checked to be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLMaxUnpoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice_3d = window.first_slice_window_3D();
+
+    do
+    {
+        // Tensor arguments are bound in this order: input, output, indices
+        unsigned int arg_idx = 0;
+        add_3D_tensor_argument(arg_idx, _input, slice_3d);
+        add_3D_tensor_argument(arg_idx, _output, slice_3d);
+        add_3D_tensor_argument(arg_idx, _indices, slice_3d);
+        enqueue(queue, *this, slice_3d, lws_hint());
+    } while (window.slide_window_slice_3D(slice_3d));
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
new file mode 100644
index 0000000000..eb18a46784
--- /dev/null
+++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLMAXUNPOOLINGLAYERKERNEL_H
+#define ARM_COMPUTE_CLMAXUNPOOLINGLAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the max unpooling layer kernel */
+class CLMaxUnpoolingLayerKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLMaxUnpoolingLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLMaxUnpoolingLayerKernel(const CLMaxUnpoolingLayerKernel &) = delete;
+ /** Prevent instances of this class from being copy-assigned (As this class contains pointers) */
+ CLMaxUnpoolingLayerKernel &operator=(const CLMaxUnpoolingLayerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLMaxUnpoolingLayerKernel(CLMaxUnpoolingLayerKernel &&) = default;
+ /** Allow instances of this class to be move-assigned */
+ CLMaxUnpoolingLayerKernel &operator=(CLMaxUnpoolingLayerKernel &&) = default;
+ /** Default destructor */
+ ~CLMaxUnpoolingLayerKernel() = default;
+ /** Set the input, indices and output tensors.
+ *
+ * @note Output shape must be equal to the shape of the original input to pool.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] indices Tensor containing the offset to store the input elements in the output tensor.
+ * @ref CLPoolingLayer with indices should precede this function in order to
+ * properly reconstruct the output tensor.
+ * The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
+ * @param[out] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLMaxUnpoolingLayerKernel
+ *
+ * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] indices TensorInfo associated to the tensor containing the offset to store the input elements in the output tensor.
+ * @ref CLPoolingLayer with indices should precede this function in order to
+ * properly reconstruct the output tensor.
+ * The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
+ * @param[in] output Destination tensor info. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *indices,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info);
+
+ // Inherited methods overridden
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Bound as the first 3D tensor argument in run(); presumably the configure() input */
+ ICLTensor *_output; /**< Bound as the second 3D tensor argument in run(); presumably the configure() output */
+ const ICLTensor *_indices; /**< Bound as the third 3D tensor argument in run(); presumably the configure() indices */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLMAXUNPOOLINGLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLMeanStdDevKernel.cpp b/src/core/CL/kernels/CLMeanStdDevKernel.cpp
deleted file mode 100644
index de8b57ef17..0000000000
--- a/src/core/CL/kernels/CLMeanStdDevKernel.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
-
-#include <cmath>
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLMeanStdDevKernel::CLMeanStdDevKernel()
- : _input(nullptr), _mean(nullptr), _stddev(nullptr), _global_sum(nullptr), _global_sum_squared(nullptr), _border_size(0)
-{
-}
-
-BorderSize CLMeanStdDevKernel::border_size() const
-{
- return _border_size;
-}
-
-Status CLMeanStdDevKernel::validate(const ITensorInfo *input, float *mean, cl::Buffer *global_sum, float *stddev, cl::Buffer *global_sum_squared)
-{
- ARM_COMPUTE_UNUSED(mean);
- ARM_COMPUTE_UNUSED(stddev);
- ARM_COMPUTE_UNUSED(global_sum);
- ARM_COMPUTE_UNUSED(global_sum_squared);
- ARM_COMPUTE_RETURN_ERROR_ON_INT64_BASE_ATOMICS_UNSUPPORTED();
- ARM_COMPUTE_RETURN_ERROR_ON_TENSOR_NOT_2D(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-
- return Status{};
-}
-
-void CLMeanStdDevKernel::configure(const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev, cl::Buffer *global_sum_squared)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, mean, global_sum, stddev, global_sum_squared);
-}
-
-void CLMeanStdDevKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev, cl::Buffer *global_sum_squared)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, global_sum);
- ARM_COMPUTE_ERROR_ON(stddev && nullptr == global_sum_squared);
- ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevKernel::validate(input->info(), mean, global_sum, stddev, global_sum_squared));
-
- _input = input;
- _mean = mean;
- _stddev = stddev;
- _global_sum = global_sum;
- _global_sum_squared = global_sum_squared;
-
- // Create kernel
- std::set<std::string> build_opts;
-
- if(_stddev != nullptr)
- {
- build_opts.insert("-DSTDDEV");
- }
-
- _kernel = create_kernel(compile_context, "mean_stddev_accumulate", build_opts);
-
- // Set fixed arguments
- unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input parameters
-
- _kernel.setArg(idx++, static_cast<cl_uint>(input->info()->dimension(1)));
- _kernel.setArg(idx++, *_global_sum);
-
- if(_stddev != nullptr)
- {
- _kernel.setArg(idx++, *_global_sum_squared);
- }
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration_x = 8;
- const unsigned int num_elems_processed_per_iteration_y = input->info()->dimension(1);
-
- _border_size = BorderSize(ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration_x) - input->info()->dimension(0));
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
- update_window_and_padding(win, input_access);
-
- ICLKernel::configure_internal(win);
-}
-
-void CLMeanStdDevKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- // Clear sums
- static const cl_ulong zero = 0;
- queue.enqueueWriteBuffer(*_global_sum, CL_FALSE, 0, sizeof(cl_ulong), &zero);
-
- if(_stddev != nullptr)
- {
- queue.enqueueWriteBuffer(*_global_sum_squared, CL_FALSE, 0, sizeof(cl_ulong), &zero);
- }
-
- Window slice = window.first_slice_window_2D();
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
- // Set slice step equal to height to force gws[1] to 1,
- // as each thread calculates the sum across all rows and columns equal to the number of elements processed by each work-item
- slice.set_dimension_step(Window::DimY, _input->info()->dimension(1));
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-
- // Calculate mean and stddev
- cl_ulong global_sum = 0;
- cl_ulong global_sum_squared = 0;
- const float num_pixels = _input->info()->dimension(0) * _input->info()->dimension(1);
-
- queue.enqueueReadBuffer(*_global_sum, CL_TRUE, 0, sizeof(cl_ulong), static_cast<void *>(&global_sum));
- const float mean = global_sum / num_pixels;
- *_mean = mean;
-
- if(_stddev != nullptr)
- {
- queue.enqueueReadBuffer(*_global_sum_squared, CL_TRUE, 0, sizeof(cl_ulong), static_cast<void *>(&global_sum_squared));
- *_stddev = std::sqrt((global_sum_squared / num_pixels) - (mean * mean));
- }
-}
diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
index 4230570ae0..8632bdf623 100644
--- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,18 +21,20 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
+#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
@@ -48,39 +50,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
return Status{};
}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- if(output != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, *input);
- }
-
- const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
-
- // This kernel doesn't need padding
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- if(output != nullptr)
- {
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
- }
-
- return std::make_pair(Status{}, win);
-}
} // namespace
CLMeanStdDevNormalizationKernel::CLMeanStdDevNormalizationKernel()
: _input(nullptr), _output(nullptr), _run_in_place(false)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLMeanStdDevNormalizationKernel::configure(ICLTensor *input, ICLTensor *output, float epsilon)
@@ -88,18 +70,28 @@ void CLMeanStdDevNormalizationKernel::configure(ICLTensor *input, ICLTensor *out
configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon);
}
-void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon)
+void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ float epsilon)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
_run_in_place = (output == nullptr) || (output == input);
- ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevNormalizationKernel::validate(input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
+ ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevNormalizationKernel::validate(
+ input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
+
+ if (output != nullptr)
+ {
+ auto_init_if_empty(*output->info(), *input->info());
+ }
_input = input;
_output = output;
- const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
// Set build options
CLBuildOptions build_opts;
@@ -107,15 +99,15 @@ void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon));
build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.add_option_if(input->info()->data_type() == DataType::F16, "-DMEANSTDNORM_HALF");
build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
// Create kernel
_kernel = create_kernel(compile_context, "mean_stddev_normalization", build_opts.options());
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ ICLKernel::configure_internal(win);
// Set config_id for enabling LWS tuning
_config_id = "mean_stddev_normalization_layer_";
@@ -129,7 +121,6 @@ void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_
Status CLMeanStdDevNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float epsilon)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, epsilon));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output != nullptr) ? output->clone().get() : nullptr).first);
return Status{};
}
@@ -149,7 +140,6 @@ void CLMeanStdDevNormalizationKernel::run(const Window &window, cl::CommandQueue
add_2D_tensor_argument_if((!_run_in_place), idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
+ } while (window.slide_window_slice_2D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
new file mode 100644
index 0000000000..e02a3c58a3
--- /dev/null
+++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLMEANSTDDEVNORMALIZATIONKERNEL_H
+#define ARM_COMPUTE_CLMEANSTDDEVNORMALIZATIONKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to normalize the input 2D tensor across the first dimension with respect to mean and standard deviation of the same dimension. */
+class CLMeanStdDevNormalizationKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLMeanStdDevNormalizationKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLMeanStdDevNormalizationKernel(const CLMeanStdDevNormalizationKernel &) = delete;
+ /** Prevent instances of this class from being copy-assigned (As this class contains pointers) */
+ CLMeanStdDevNormalizationKernel &operator=(const CLMeanStdDevNormalizationKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLMeanStdDevNormalizationKernel(CLMeanStdDevNormalizationKernel &&) = default;
+ /** Allow instances of this class to be move-assigned */
+ CLMeanStdDevNormalizationKernel &operator=(CLMeanStdDevNormalizationKernel &&) = default;
+ /** Default destructor */
+ ~CLMeanStdDevNormalizationKernel() = default;
+ /** Initialise the kernel's input and outputs using the compile context obtained from @ref CLKernelLibrary.
+ *
+ * @note If the output tensor is a nullptr or the same tensor as @p input, the normalization will be performed in-place.
+ *
+ * @param[in, out] input Source tensor with 2 dimensions. In case of @p output tensor = nullptr,
+ * this tensor will store the result of the normalization. Data types supported: F16/F32.
+ * @param[out] output (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input
+ * @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
+ */
+ void configure(ICLTensor *input, ICLTensor *output = nullptr, float epsilon = 1e-8f);
+ /** Initialise the kernel's input and outputs.
+ *
+ * @note If the output tensor is a nullptr or the same tensor as @p input, the normalization will be performed in-place.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in, out] input Source tensor with 2 dimensions. In case of @p output tensor = nullptr,
+ * this tensor will store the result of the normalization. Data types supported: F16/F32.
+ * @param[out] output (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input
+ * @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
+ */
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output = nullptr,
+ float epsilon = 1e-8f);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevNormalizationKernel
+ *
+ * @param[in] input Source tensor info with 2 dimensions. In case of @p output tensor info = nullptr,
+ * this tensor will store the result of the normalization. Data types supported: F16/F32.
+ * @param[in] output (Optional) Destination tensor info. It can be nullptr in case of in-place computation. Data type supported: same as @p input
+ * @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output = nullptr, float epsilon = 1e-8f);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_input; /**< Tensor passed as @p input to configure() */
+ ICLTensor *_output; /**< Tensor passed as @p output to configure(); may be nullptr */
+ bool _run_in_place; /**< Set by configure() when @p output is nullptr or equal to @p input */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLMEANSTDDEVNORMALIZATIONKERNEL_H */
diff --git a/src/core/CL/kernels/CLMedian3x3Kernel.cpp b/src/core/CL/kernels/CLMedian3x3Kernel.cpp
deleted file mode 100644
index 3b1b6ada03..0000000000
--- a/src/core/CL/kernels/CLMedian3x3Kernel.cpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-BorderSize CLMedian3x3Kernel::border_size() const
-{
- return BorderSize(1);
-}
-
-void CLMedian3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined);
-}
-
-void CLMedian3x3Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
- _input = input;
- _output = output;
-
- // Create kernel
- const std::string kernel_name = std::string("non_linear_filter_box3x3");
- _kernel = create_kernel(compile_context, kernel_name, { "-DMEDIAN" });
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 8;
- constexpr unsigned int num_elems_read_per_iteration = 16;
- constexpr unsigned int num_elems_written_per_iteration = 8;
- constexpr unsigned int num_rows_read_per_iteration = 3;
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
- AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(border_undefined);
-}
diff --git a/src/core/CL/kernels/CLMemsetKernel.cpp b/src/core/CL/kernels/CLMemsetKernel.cpp
deleted file mode 100644
index 992be0a10a..0000000000
--- a/src/core/CL/kernels/CLMemsetKernel.cpp
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-CLMemsetKernel::CLMemsetKernel()
- : ICLKernel(), _tensor(nullptr), _full_window()
-{
-}
-
-void CLMemsetKernel::configure(ICLTensor *tensor,
- const PixelValue &constant_value,
- Window *window)
-{
- configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value, window);
-}
-
-void CLMemsetKernel::configure(const CLCompileContext &compile_context, ICLTensor *tensor,
- const PixelValue &constant_value,
- Window *window)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
- ARM_COMPUTE_ERROR_THROW_ON(validate(tensor->info(), constant_value, window));
-
- _tensor = tensor;
-
- const DataType data_type = tensor->info()->data_type();
- const int vec_size_x = 16 / tensor->info()->element_size();
-
- // Create and update the window (if needed)
- _full_window = calculate_max_window(*tensor->info());
- Window win = _full_window;
- if(window != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *window);
- win = *window;
- }
-
- const int output_width_x = win.num_iterations(0);
- const bool multi_access_x = output_width_x >= vec_size_x;
- const bool remainder_x = output_width_x % vec_size_x > 0;
-
- if(multi_access_x)
- {
- win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
- }
- ICLKernel::configure_internal(win);
-
- // Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
- build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(constant_value, data_type));
- build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
- build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
- _kernel = create_kernel(compile_context, "memset", build_opts.options());
-}
-
-Status CLMemsetKernel::validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window)
-{
- ARM_COMPUTE_UNUSED(tensor);
- ARM_COMPUTE_UNUSED(constant_value);
- if(window != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(window->x().step() != 1);
- }
- return Status{};
-}
-
-void CLMemsetKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- // Collapse all the batches on the third
- Window collapsed = window.collapse_if_possible(_full_window, Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _tensor, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
deleted file mode 100644
index 2ff9196f13..0000000000
--- a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/StringSupport.h"
-
-#include <climits>
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
-
- if(output->tensor_shape().total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
- TensorShape output_shape = compute_min_max_shape(input);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
- }
-
- return Status{};
-}
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- TensorShape output_shape = compute_min_max_shape(input);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output, output_shape, 1, input->data_type());
-
- const unsigned int num_elems_processed_per_iteration = 1;
-
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowStatic output_access(output, 0, 0, 2, output->dimension(1));
-
- bool window_changed = update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_tuple(err, win);
-}
-} // namespace
-
-CLMinMaxLayerKernel::CLMinMaxLayerKernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLMinMaxLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLMinMaxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
- _input = input;
- _output = output;
-
- std::set<std::string> build_opts;
- build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
- build_opts.emplace("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
- build_opts.emplace("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
-
- // Create kernel
- _kernel = create_kernel(compile_context, "minmax_layer", build_opts);
-
- auto win_config = validate_and_configure_window(input->info(), output->info());
-
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
- ICLKernel::configure_internal(std::get<1>(win_config));
-}
-
-Status CLMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
-
- return Status{};
-}
-
-void CLMinMaxLayerKernel::reset(cl::CommandQueue &queue)
-{
- _output->map(queue, true);
-
- Window window_output;
- window_output.use_tensor_dimensions(_output->info()->tensor_shape());
- window_output.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator output(_output, window_output);
-
- // Reset output
- execute_window_loop(window_output, [&](const Coordinates &)
- {
- auto *ptr = reinterpret_cast<float *>(output.ptr());
- ptr[0] = std::numeric_limits<float>::max();
- ptr[1] = std::numeric_limits<float>::min();
- },
- output);
-
- _output->unmap(queue);
-}
-
-void CLMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3);
- Window slice = window_collapsed.first_slice_window_3D();
- slice.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice.set(Window::DimY, Window::Dimension(0, 1, 1));
- slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
- do
- {
- Window output_slice = slice.shift_dimensions(2);
-
- unsigned int idx = 0;
- // Set inputs
- add_3D_tensor_argument(idx, _input, slice);
- add_1D_tensor_argument(idx, _output, output_slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
-}
diff --git a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
deleted file mode 100644
index dfa0555331..0000000000
--- a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/StringSupport.h"
-
-#include <climits>
-
-namespace arm_compute
-{
-inline int32_t FloatFlip(float val)
-{
- static_assert(sizeof(float) == sizeof(int32_t), "Float must be same size as int32_t");
- int32_t int_val = 0;
-
- memcpy(&int_val, &val, sizeof(float));
- int_val = (int_val >= 0) ? int_val : int_val ^ 0x7FFFFFFF;
- return int_val;
-}
-
-inline float IFloatFlip(int32_t val)
-{
- static_assert(sizeof(float) == sizeof(int32_t), "Float must be same size as int32_t");
- float flt_val = 0.f;
-
- val = (val >= 0) ? val : val ^ 0x7FFFFFFF;
- memcpy(&flt_val, &val, sizeof(float));
- return flt_val;
-}
-
-CLMinMaxKernel::CLMinMaxKernel()
- : _input(nullptr), _min_max(), _data_type_max_min()
-{
-}
-
-void CLMinMaxKernel::configure(const ICLImage *input, cl::Buffer *min_max)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, min_max);
-}
-
-void CLMinMaxKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, cl::Buffer *min_max)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
- ARM_COMPUTE_ERROR_ON(min_max == nullptr);
-
- _input = input;
- _min_max = min_max;
- const unsigned int num_elems_processed_per_iteration = input->info()->dimension(0);
-
- switch(input->info()->data_type())
- {
- case DataType::U8:
- _data_type_max_min[0] = UCHAR_MAX;
- _data_type_max_min[1] = 0;
- break;
- case DataType::S16:
- _data_type_max_min[0] = SHRT_MAX;
- _data_type_max_min[1] = SHRT_MIN;
- break;
- case DataType::F32:
- _data_type_max_min[0] = FloatFlip(std::numeric_limits<float>::max());
- _data_type_max_min[1] = FloatFlip(std::numeric_limits<float>::lowest());
- break;
- default:
- ARM_COMPUTE_ERROR("You called with the wrong image data types");
- }
-
- // Set kernel build options
- std::set<std::string> build_opts{ "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()) };
-
- if(num_elems_processed_per_iteration % max_cl_vector_width != 0)
- {
- build_opts.emplace("-DNON_MULTIPLE_OF_16");
- }
-
- if(input->info()->data_type() == DataType::F32)
- {
- build_opts.emplace("-DDATA_TYPE_MAX=" + support::cpp11::to_string(std::numeric_limits<float>::max()));
- build_opts.emplace("-DDATA_TYPE_MIN=" + support::cpp11::to_string(std::numeric_limits<float>::lowest()));
- build_opts.emplace("-DIS_DATA_TYPE_FLOAT");
- }
- else
- {
- build_opts.emplace("-DDATA_TYPE_MAX=" + support::cpp11::to_string(_data_type_max_min[0]));
- build_opts.emplace("-DDATA_TYPE_MIN=" + support::cpp11::to_string(_data_type_max_min[1]));
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, "minmax", build_opts);
-
- // Set fixed arguments
- unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters
- _kernel.setArg(idx++, *_min_max);
- _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(input->info()->dimension(0)));
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, ceil_to_multiple(num_elems_processed_per_iteration, 16)));
- ICLKernel::configure_internal(win);
-}
-
-void CLMinMaxKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- // Reset mininum and maximum values
- queue.enqueueWriteBuffer(*_min_max, CL_FALSE /* blocking */, 0, _data_type_max_min.size() * sizeof(int), _data_type_max_min.data());
-
- Window slice = window.first_slice_window_2D();
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-
- cl_int min = 0;
- cl_int max = 0;
- queue.enqueueReadBuffer(*_min_max, CL_TRUE /* blocking */, 0 * sizeof(cl_int), sizeof(cl_int), static_cast<int *>(&min));
- queue.enqueueReadBuffer(*_min_max, CL_TRUE /* blocking */, 1 * sizeof(cl_int), sizeof(cl_int), static_cast<int *>(&max));
-
- if(_input->info()->data_type() == DataType::F32)
- {
- std::array<float, 2> min_max =
- {
- {
- IFloatFlip(min),
- IFloatFlip(max)
- }
- };
- queue.enqueueWriteBuffer(*_min_max, CL_TRUE /* blocking */, 0, min_max.size() * sizeof(float), min_max.data());
- }
- else
- {
- std::array<int32_t, 2> min_max = { { min, max } };
- queue.enqueueWriteBuffer(*_min_max, CL_TRUE /* blocking */, 0, min_max.size() * sizeof(int32_t), min_max.data());
- }
-}
-
-CLMinMaxLocationKernel::CLMinMaxLocationKernel()
- : _input(nullptr), _min_max_count(nullptr)
-{
-}
-
-void CLMinMaxLocationKernel::configure(const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count, ICLCoordinates2DArray *min_loc, ICLCoordinates2DArray *max_loc)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, min_max, min_max_count, min_loc, max_loc);
-}
-
-void CLMinMaxLocationKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count, ICLCoordinates2DArray *min_loc,
- ICLCoordinates2DArray *max_loc)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
- ARM_COMPUTE_ERROR_ON(min_max == nullptr);
- ARM_COMPUTE_ERROR_ON(min_max_count == nullptr && min_loc == nullptr && max_loc == nullptr);
-
- _input = input;
- _min_max_count = min_max_count;
-
- // Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.emplace((min_max_count != nullptr) ? "-DCOUNT_MIN_MAX" : "");
- build_opts.emplace((min_loc != nullptr) ? "-DLOCATE_MIN" : "");
- build_opts.emplace((max_loc != nullptr) ? "-DLOCATE_MAX" : "");
- if(input->info()->data_type() == DataType::F32)
- {
- build_opts.emplace("-DIS_DATA_TYPE_FLOAT");
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, "minmaxloc", build_opts);
-
- // Set static arguments
- unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters
- _kernel.setArg(idx++, *min_max);
- _kernel.setArg(idx++, *min_max_count);
- if(min_loc != nullptr)
- {
- _kernel.setArg(idx++, min_loc->cl_buffer());
- _kernel.setArg<cl_uint>(idx++, min_loc->max_num_values());
- }
- if(max_loc != nullptr)
- {
- _kernel.setArg(idx++, max_loc->cl_buffer());
- _kernel.setArg<cl_uint>(idx++, max_loc->max_num_values());
- }
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 1;
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
- ICLKernel::configure_internal(win);
-}
-
-void CLMinMaxLocationKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- static const unsigned int zero_count = 0;
- queue.enqueueWriteBuffer(*_min_max_count, CL_FALSE, 0 * sizeof(zero_count), sizeof(zero_count), &zero_count);
- queue.enqueueWriteBuffer(*_min_max_count, CL_FALSE, 1 * sizeof(zero_count), sizeof(zero_count), &zero_count);
-
- Window slice = window.first_slice_window_2D();
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLNonLinearFilterKernel.cpp b/src/core/CL/kernels/CLNonLinearFilterKernel.cpp
deleted file mode 100644
index 5066c3b16a..0000000000
--- a/src/core/CL/kernels/CLNonLinearFilterKernel.cpp
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <algorithm>
-#include <cmath>
-#include <cstdlib>
-#include <set>
-#include <sstream>
-#include <string>
-
-using namespace arm_compute;
-
-CLNonLinearFilterKernel::CLNonLinearFilterKernel()
- : _border_size(0)
-{
-}
-
-BorderSize CLNonLinearFilterKernel::border_size() const
-{
- return _border_size;
-}
-
-void CLNonLinearFilterKernel::configure(const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function,
- unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
- bool border_undefined)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, function, mask_size, pattern, mask, border_undefined);
-}
-
-void CLNonLinearFilterKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function,
- unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
- bool border_undefined)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON(mask_size != 3 && mask_size != 5);
- ARM_COMPUTE_ERROR_ON_MSG(pattern == MatrixPattern::OTHER, "MatrixPattern::OTHER is not supported!");
- ARM_COMPUTE_UNUSED(mask);
-
- _input = input;
- _output = output;
- _border_size = BorderSize(mask_size / 2);
-
- // Define build options
- std::set<std::string> build_opts;
- build_opts.emplace("-D" + string_from_non_linear_filter_function(function));
-
- // Define kernel
- std::string pattern_name = string_from_matrix_pattern(pattern);
- std::transform(pattern_name.begin(), pattern_name.end(), pattern_name.begin(), ::tolower);
- std::stringstream ss;
- ss << "non_linear_filter_" << pattern_name << mask_size << "x" << mask_size;
-
- // Create kernel
- _kernel = create_kernel(compile_context, ss.str(), build_opts);
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 8;
- constexpr unsigned int num_elems_read_per_iteration = 16;
- const unsigned int num_rows_read_per_iteration = mask_size;
-
- Window win = calculate_max_window(*input->info(), num_elems_processed_per_iteration, border_undefined, border_size());
-
- AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
- ICLKernel::configure_internal(win);
-}
diff --git a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp
deleted file mode 100644
index 7de7735f0c..0000000000
--- a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-BorderSize CLNonMaximaSuppression3x3Kernel::border_size() const
-{
- return BorderSize(1);
-}
-
-void CLNonMaximaSuppression3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined);
-}
-
-void CLNonMaximaSuppression3x3Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32);
-
- _input = input;
- _output = output;
-
- // Create kernel
- std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) };
- _kernel = create_kernel(compile_context, "non_max_suppression", build_opts);
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 8;
- constexpr unsigned int num_elems_written_per_iteration = 8;
- constexpr unsigned int num_elems_read_per_iteration = 16;
- constexpr unsigned int num_rows_read_per_iteration = 3;
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
- AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
- ICLKernel::configure_internal(win);
-}
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index 7c8c23238d..b636c485e7 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,24 +21,29 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h"
+#include "src/core/CL/kernels/CLNormalizationLayerKernel.h"
-#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/Window.h"
-#include "support/StringSupport.h"
-using namespace arm_compute;
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/NormalizationHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/StringSupport.h"
+namespace arm_compute
+{
namespace
{
-constexpr unsigned int num_elems_processed_per_iteration = 4;
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
@@ -49,7 +54,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, N
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -59,38 +64,66 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, N
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, NormalizationLayerInfo norm_info)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, NormalizationLayerInfo norm_info)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output, *input->clone());
- const unsigned int norm_idx = get_normalization_dimension_index(input->data_layout(), norm_info);
- const bool is_norm_accross_width = norm_idx == 0;
+ bool window_changed = false;
+ Window win;
+ const DataLayout data_layout = input->data_layout();
+ if (data_layout == DataLayout::NCHW)
+ {
+ const unsigned int vec_size_x =
+ adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
+ const unsigned int norm_idx = get_normalization_dimension_index(input->data_layout(), norm_info);
+ const bool is_norm_across_width = norm_idx == 0;
- const unsigned int border_width = is_norm_accross_width ? num_elems_processed_per_iteration - 1 : 0;
- const BorderSize border_size = BorderSize(0, border_width);
+ const unsigned int norm_radius = norm_info.norm_size() / 2;
+ // Border / padding calculation:
+ // For NCHW no border handling is implemented in the kernel in the x axis.
+ // This means the x axis is fully-padded depending on vec_size_x and norm_size
+ // E.G. for input x dimension = 3, norm_size = 3 (radius = 1), vec_size_x = 2 ('#' is element 'p' is padding):
+ // In : |p|#|#|#|p|p|
+ // Out: |#|#|#|p|
+ // The output has 1 right padding because of the vec_size_x.
+ // The input has 1 left padding because radius = 1.
+ // The input has 2 right padding because of radius = 1 AND because of the extra output padding
+ const unsigned int border_width_left = is_norm_across_width ? norm_radius : 0;
+ const unsigned int border_width_right =
+ is_norm_across_width ? norm_radius + (vec_size_x - input->dimension(0) % vec_size_x) : 0;
+ const BorderSize border_size = BorderSize(0, border_width_right, 0, border_width_left);
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- bool window_changed = false;
+ win = calculate_max_window(*input, Steps(vec_size_x));
- // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside the kernel, avoiding padding
- // Reads can occur within the valid region of the input
- if(is_norm_accross_width)
- {
- AccessWindowStatic input_access(input, -border_size.left, 0, input->dimension(0) + border_size.right, 0);
- window_changed = window_changed || update_window_and_padding(win, input_access);
+ // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside the kernel, avoiding padding
+ // Reads can occur within the valid region of the input
+ if (is_norm_across_width)
+ {
+ AccessWindowStatic input_access(input, -border_size.left, 0, input->dimension(0) + border_size.right, 0);
+ window_changed = window_changed || update_window_and_padding(win, input_access);
+ }
+ else
+ {
+ AccessWindowHorizontal input_access(input, -border_size.left, vec_size_x);
+ window_changed = window_changed || update_window_and_padding(win, input_access);
+ }
+
+ AccessWindowHorizontal output_access(output, 0, vec_size_x);
+ window_changed = window_changed || update_window_and_padding(win, output_access);
}
else
{
- AccessWindowHorizontal input_access(input, -border_size.left, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, input_access);
+ unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
+ if (norm_info.is_cross_map())
+ {
+ vec_size_x = 1;
+ }
+ win = calculate_max_window(*input, Steps(vec_size_x));
}
-
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, output_access);
- output_access.set_valid_region(win, input->valid_region());
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
@@ -98,6 +131,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
CLNormalizationLayerKernel::CLNormalizationLayerKernel()
: _input(nullptr), _output(nullptr), _border_size(0), _is_norm_across_width(false)
{
+ _type = CLKernelType::ELEMENTWISE;
}
BorderSize CLNormalizationLayerKernel::border_size() const
@@ -110,24 +144,51 @@ void CLNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *ou
configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info);
}
-void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info)
+void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), *input->info()->clone());
+ auto padding_info = get_padding_info({input, output});
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info));
+ auto win_config = validate_and_configure_window(input->info(), output->info(), norm_info);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
_input = input;
_output = output;
- const DataLayout data_layout = input->info()->data_layout();
- const unsigned int norm_idx = get_normalization_dimension_index(data_layout, norm_info);
- _is_norm_across_width = norm_idx == 0;
- const unsigned int border_width = _is_norm_across_width ? num_elems_processed_per_iteration - 1 : 0;
- _border_size = BorderSize(0, border_width);
+ const DataLayout data_layout = input->info()->data_layout();
+ unsigned int vec_size_x =
+ adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
+ int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
+ if (norm_info.is_cross_map() && data_layout == DataLayout::NHWC)
+ {
+ vec_size_x = 1;
+ vec_size_x_leftovers = 0;
+ }
+
+ if (data_layout == DataLayout::NCHW)
+ {
+ const unsigned int norm_idx = get_normalization_dimension_index(data_layout, norm_info);
+ _is_norm_across_width = norm_idx == 0;
+ const unsigned int norm_radius = norm_info.norm_size() / 2;
+ // Border / padding calculation:
+ // For NCHW no border handling is implemented in the kernel in the x axis.
+ // This means the x axis is fully-padded depending on vec_size_x and norm_size.
+ // E.g. for input x dimension = 3, norm_size = 3 (radius = 1), vec_size_x = 2 ('#' is an element, 'p' is padding):
+ // In : |p|#|#|#|p|p|
+ // Out: |#|#|#|p|
+ // The output has 1 right padding because of the vec_size_x.
+ // The input has 1 left padding because radius = 1.
+ // The input has 2 right padding because of radius = 1 AND the extra output padding
+ const unsigned int border_width_left = _is_norm_across_width ? norm_radius : 0;
+ const unsigned int border_width_right =
+ _is_norm_across_width ? norm_radius + (vec_size_x - input->info()->dimension(0) % vec_size_x) : 0;
+ _border_size = BorderSize(0, border_width_right, 0, border_width_left);
+ }
const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D);
@@ -137,35 +198,29 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte
build_opts.add_option(("-DCOEFF=" + float_to_string_with_full_precision(norm_info.scale_coeff())));
build_opts.add_option(("-DBETA=" + float_to_string_with_full_precision(norm_info.beta())));
build_opts.add_option(("-DKAPPA=" + float_to_string_with_full_precision(norm_info.kappa())));
- build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)));
+ build_opts.add_option(("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftovers)));
build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size() / 2)));
build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2))));
build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D");
- build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()), "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()),
+ "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.add_option_if(norm_info.is_in_map() && data_layout == DataLayout::NHWC,
+ "-DDIM1_SIZE=" + support::cpp11::to_string(input->info()->dimension(1)));
// Create kernel
std::string kernel_name;
- if(norm_info.is_in_map())
+ if (norm_info.is_in_map())
{
kernel_name = "normalization_layer_in_map_" + lower_string(string_from_data_layout(data_layout));
}
else
{
- if(data_layout == DataLayout::NCHW)
- {
- kernel_name = "normalization_layer_cross_map";
- }
- else
- {
- // 1D Cross-Map normalization in NHWC is the same as 1D In-Map normalization in NCHW
- kernel_name = "normalization_layer_in_map_nchw";
- }
+ kernel_name = "normalization_layer_cross_map_" + lower_string(string_from_data_layout(data_layout));
}
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), norm_info);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure_internal(win_config.second);
// Set config_id for enabling LWS tuning
@@ -179,12 +234,19 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte
_config_id += support::cpp11::to_string(input->info()->dimension(0));
_config_id += "_";
_config_id += support::cpp11::to_string(input->info()->dimension(1));
+ if (data_layout == DataLayout::NHWC)
+ {
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+ }
}
-Status CLNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info)
+Status CLNormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first);
return Status{};
}
@@ -204,6 +266,6 @@ void CLNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &que
add_3D_tensor_argument(idx, _input, slice);
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
+ } while (window_collapsed.slide_window_slice_3D(slice));
}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.h b/src/core/CL/kernels/CLNormalizationLayerKernel.h
new file mode 100644
index 0000000000..5517ba6904
--- /dev/null
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLNORMALIZATIONLAYERKERNEL_H
+#define ARM_COMPUTE_CLNORMALIZATIONLAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the normalization layer kernel.
+ */
+class CLNormalizationLayerKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLNormalizationLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLNormalizationLayerKernel(const CLNormalizationLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLNormalizationLayerKernel &operator=(const CLNormalizationLayerKernel &) = delete;
+ /** Default Move Constructor. */
+ CLNormalizationLayerKernel(CLNormalizationLayerKernel &&) = default;
+ /** Default move assignment operator */
+ CLNormalizationLayerKernel &operator=(CLNormalizationLayerKernel &&) = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
+ * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data types supported: same as @p input.
+ * Data layouts supported: same as @p input.
+ * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info);
+ /** Set the input and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
+ * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data types supported: same as @p input.
+ * Data layouts supported: same as @p input.
+ * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ NormalizationLayerInfo norm_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayerKernel
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
+ * @param[in] output Destination tensor. Output will have the same number of dimensions as input. Data types supported: same as @p input.
+ * Data layouts supported: same as @p input.
+ * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ BorderSize _border_size;
+ bool _is_norm_across_width;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLNORMALIZATIONLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
index 2ca77161ea..59352a8fb7 100644
--- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,39 +21,45 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
+#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
-#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, std);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, std);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(mean->num_dimensions() > 1, "mean and std must be vectors");
- const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+ const unsigned int channel_idx =
+ get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != mean->dimension(0));
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -63,11 +69,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *mean, ITensorInfo *std)
+std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *output)
{
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, *input->clone());
-
const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
@@ -76,16 +79,9 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
bool window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, input->valid_region());
-
- if(input->data_layout() == DataLayout::NHWC)
- {
- AccessWindowHorizontal mean_access(mean, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal std_access(std, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, mean_access, std_access);
- }
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
@@ -93,36 +89,57 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
CLNormalizePlanarYUVLayerKernel::CLNormalizePlanarYUVLayerKernel()
: _input(nullptr), _output(nullptr), _mean(nullptr), _std(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *std)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std);
}
-void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *std)
{
// Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, mean, std);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mean->info(), std->info()));
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), *input->info()->clone());
+
+ auto padding_info = get_padding_info({input, output});
+
_input = input;
_output = output;
_mean = mean;
_std = std;
- const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
- const unsigned int channel_idx = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
- const DataType dt = input->info()->data_type();
+ const DataLayout data_layout = input->info()->data_layout();
+
+ // Get number of elements to process per iteration
+ const unsigned int num_elems_processed_per_iteration =
+ (data_layout == DataLayout::NHWC)
+ ? adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0))
+ : (16 / input->info()->element_size());
+ const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const DataType dt = input->info()->data_type();
// Set build options
CLBuildOptions build_opts;
build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ build_opts.add_option(("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)));
build_opts.add_option(("-DNUM_CHANNELS=" + support::cpp11::to_string(input->info()->dimension(channel_idx))));
std::string kernel_name = "normalize_planar_yuv_layer_";
- if(is_data_type_quantized(dt))
+ if (is_data_type_quantized(dt))
{
const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
build_opts.add_option(("-DOFFSET=" + support::cpp11::to_string(qinfo.offset)));
@@ -131,13 +148,22 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_
}
// Create kernel
- kernel_name += lower_string(string_from_data_layout(input->info()->data_layout()));
+ kernel_name += lower_string(string_from_data_layout(data_layout));
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), mean->info(), std->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
+ if (data_layout == DataLayout::NHWC)
+ {
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ ICLKernel::configure_internal(win);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+ }
+ else
+ {
+ auto win_config = validate_and_configure_window_nchw(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+ }
// Set config_id for enabling LWS tuning
_config_id = "normalize_planar_yuv_layer_";
@@ -152,11 +178,17 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_
_config_id += support::cpp11::to_string(input->info()->dimension(2));
}
-Status CLNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
+Status CLNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *std)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, std));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), mean->clone().get(), std->clone().get()).first);
-
+ if (input->data_layout() == DataLayout::NCHW)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window_nchw(input->clone().get(), output->clone().get()).first);
+ }
return Status{};
}
@@ -181,7 +213,6 @@ void CLNormalizePlanarYUVLayerKernel::run(const Window &window, cl::CommandQueue
add_3D_tensor_argument(idx, _input, slice);
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
new file mode 100644
index 0000000000..341b404e3d
--- /dev/null
+++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYERKERNEL_H
+#define ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the NormalizePlanarYUV layer kernel. */
+class CLNormalizePlanarYUVLayerKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLNormalizePlanarYUVLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLNormalizePlanarYUVLayerKernel(const CLNormalizePlanarYUVLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLNormalizePlanarYUVLayerKernel &operator=(const CLNormalizePlanarYUVLayerKernel &) = delete;
+ /** Default Move Constructor. */
+ CLNormalizePlanarYUVLayerKernel(CLNormalizePlanarYUVLayerKernel &&) = default;
+ /** Default move assignment operator */
+ CLNormalizePlanarYUVLayerKernel &operator=(CLNormalizePlanarYUVLayerKernel &&) = default;
+ /** Default destructor */
+ ~CLNormalizePlanarYUVLayerKernel() = default;
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, channels].
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[out] output Destination tensor. Data type supported: same as @p input
+ * @param[in] mean Mean values tensor. 1 dimension with size equal to the number of input channels. Data types supported: same as @p input
+ * @param[in] std Standard deviation values tensor. 1 dimension with size equal to the number of input channels.
+ * Data types supported: same as @p input
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std);
+ /** Set the input and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, channels].
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[out] output Destination tensor. Data type supported: same as @p input
+ * @param[in] mean Mean values tensor. 1 dimension with size equal to the number of input channels. Data types supported: same as @p input
+ * @param[in] std Standard deviation values tensor. 1 dimension with size equal to the number of input channels.
+ * Data types supported: same as @p input
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *std);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizePlanarYUVLayerKernel
+ *
+ * @param[in] input Source tensor info. 3 lower dimensions represent a single input with dimensions [width, height, channels].
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] output Destination tensor info. Data type supported: same as @p input
+ * @param[in] mean Mean values tensor info. 1 dimension with size equal to the number of input channels. Data types supported: same as @p input
+ * @param[in] std Standard deviation values tensor info. 1 dimension with size equal to the number of input channels.
+ * Data types supported: same as @p input
+ *
+ * @return a status
+ */
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ const ICLTensor *_mean;
+ const ICLTensor *_std;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLPadLayerKernel.cpp b/src/core/CL/kernels/CLPadLayerKernel.cpp
index 82508ec8ba..0ac285038e 100644
--- a/src/core/CL/kernels/CLPadLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPadLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,35 +21,44 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
+#include "src/core/CL/kernels/CLPadLayerKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ PixelValue constant_value,
+ PaddingMode mode)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_UNUSED(constant_value);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > input->num_dimensions());
- if(mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC)
+ ARM_COMPUTE_RETURN_ERROR_ON((padding.size() < 1) || (padding.size() > input->num_dimensions()));
+ if (mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC)
{
ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 3);
const auto is_reflect = static_cast<unsigned int>(mode == PaddingMode::REFLECT);
- for(size_t i = 0; i < padding.size(); ++i)
+ for (size_t i = 0; i < padding.size(); ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).first > (input->dimension(i) - is_reflect));
ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).second > (input->dimension(i) - is_reflect));
}
}
- if(output->total_size() > 0)
+ if (output->total_size() > 0)
{
TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
@@ -59,107 +68,93 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
return Status{};
}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode,
- unsigned int &num_elems_processed_per_iteration)
-{
- ARM_COMPUTE_UNUSED(constant_value, mode);
-
- const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(padded_shape));
-
- num_elems_processed_per_iteration = std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->data_type())));
- if(input->dimension(0) < num_elems_processed_per_iteration)
- {
- num_elems_processed_per_iteration = 1 << static_cast<unsigned int>(std::log2(input->dimension(0)));
- }
-
- // Configure kernel window
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-
- const int input_start_x = mode == PaddingMode::CONSTANT ? -(padding.at(0).first % num_elems_processed_per_iteration) : 0;
- const int input_start_y = (mode == PaddingMode::CONSTANT && padding.size() > 1) ? -padding.at(1).first : 0;
-
- AccessWindowRectangle input_access(input, input_start_x, input_start_y, num_elems_processed_per_iteration, 1);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
- const bool window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
} // namespace
-CLPadLayerKernel::CLPadLayerKernel()
- : _input(nullptr), _output(nullptr), _input_start_x(0), _input_start_y(0), _4d_enabled(false)
+CLPadLayerKernel::CLPadLayerKernel() : _input(nullptr), _output(nullptr), _4d_enabled(false)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+void CLPadLayerKernel::configure(
+ const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode);
}
-void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+void CLPadLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PaddingList &padding,
+ PixelValue constant_value,
+ PaddingMode mode)
{
- // Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ auto_init_if_empty(*output->info(),
+ input->info()->clone()->set_tensor_shape(
+ misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding)));
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), padding, constant_value, mode));
+ auto padding_info = get_padding_info({input, output});
+
_input = input;
_output = output;
_4d_enabled = (mode == PaddingMode::CONSTANT) && (padding.size() > 3);
- // Configure window
- unsigned int vec_size;
- auto win_config = validate_and_configure_window(input->info(), output->info(), padding, constant_value, mode, vec_size);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
// Set build options
- std::string kernel_name = "pad_layer_";
-
- const DataType &data_type = input->info()->data_type();
- const unsigned int input_width = input->info()->dimension(0);
- const unsigned int input_height = input->info()->dimension(1);
- const unsigned int input_depth = input->info()->dimension(2);
- const unsigned int pad_x_before = padding.at(0).first;
- const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0;
- const unsigned int pad_z_before = padding.size() > 2 ? padding.at(2).first : 0;
- const unsigned int pad_right_start = input_width + pad_x_before;
-
- _input_start_x = mode == PaddingMode::CONSTANT ? -(pad_x_before % vec_size) : 0;
- _input_start_y = (mode == PaddingMode::CONSTANT && padding.size() > 1) ? -padding.at(1).first : 0;
+ const DataType &data_type = input->info()->data_type();
+ const unsigned int input_width = input->info()->dimension(0);
+ const unsigned int input_height = input->info()->dimension(1);
+ const unsigned int input_depth = input->info()->dimension(2);
+ const unsigned int pad_x_before = padding.at(0).first;
+ const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0;
+ const unsigned int pad_z_before = padding.size() > 2 ? padding.at(2).first : 0;
+ const unsigned int vec_size = adjust_vec_size(
+ std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->info()->data_type()))),
+ input_width);
+ const unsigned int pad_right_start = input_width + pad_x_before;
+ const unsigned int pad_x_before_remainder = pad_x_before % vec_size;
+ const unsigned int vec_size_leftover_write =
+ vec_size - (ceil_to_multiple(output->info()->dimension(0), vec_size) - output->info()->dimension(0));
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
- build_opts.add_option("-DSELECT_DT=" + get_cl_select_type_from_data_type(data_type));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
build_opts.add_option("-DPAD_X_BEFORE=" + support::cpp11::to_string(pad_x_before));
build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width));
- if(padding.size() > 1)
+ build_opts.add_option("-DPAD_X_BEFORE_REMAINDER=" + support::cpp11::to_string(pad_x_before_remainder));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER_WRITE=" + support::cpp11::to_string(vec_size_leftover_write));
+ if (padding.size() > 1)
{
build_opts.add_option("-DPAD_Y_BEFORE=" + support::cpp11::to_string(pad_y_before));
build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height));
- if(padding.size() > 2)
+ if (padding.size() > 2)
{
build_opts.add_option("-DPAD_Z_BEFORE=" + support::cpp11::to_string(pad_z_before));
build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_depth));
}
}
- switch(mode)
+ std::string kernel_name = "pad_layer_";
+ switch (mode)
{
case PaddingMode::CONSTANT:
{
kernel_name += "constant";
+ const unsigned int vec_size_leftover_read =
+ vec_size - (ceil_to_multiple(pad_right_start, vec_size) - pad_right_start);
+
build_opts.add_option("-DCONST_VAL=" + string_from_pixel_value(constant_value, data_type));
- build_opts.add_option_if(pad_x_before >= vec_size, "-DNUM_THREADS_TO_SKIP_X=" + support::cpp11::to_string(pad_x_before / vec_size));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER_READ=" + support::cpp11::to_string(vec_size_leftover_read));
- if(_4d_enabled)
+ if (pad_x_before >= vec_size)
+ {
+ build_opts.add_option("-DTHREADS_TO_SKIP_BEFORE=" + support::cpp11::to_string(pad_x_before / vec_size));
+ build_opts.add_option("-DTHREADS_TO_SKIP_AFTER=" +
+ support::cpp11::to_string(pad_right_start / vec_size));
+ }
+ if (_4d_enabled)
{
build_opts.add_option("-DPAD_W_BEFORE=" + support::cpp11::to_string(padding.at(3).first));
build_opts.add_option("-DSRC_BATCH=" + support::cpp11::to_string(input->info()->dimension(3)));
@@ -174,18 +169,19 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const
const auto is_reflect = static_cast<unsigned int>(mode == PaddingMode::REFLECT);
- const unsigned int pad_x_before_remainder = pad_x_before % vec_size;
- const unsigned int pad_x_after_remainder = pad_right_start % vec_size;
- const unsigned int after_pad_fact_x = (2 * input_width + pad_x_before) - is_reflect;
- const unsigned int output_last_x = ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size);
+ const unsigned int pad_x_after_remainder = pad_right_start % vec_size;
+ const unsigned int after_pad_fact_x = (2 * input_width + pad_x_before) - is_reflect;
+ const unsigned int output_last_x = ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size);
build_opts.add_option("-DIS_REFLECT=" + support::cpp11::to_string(is_reflect));
- build_opts.add_option("-DPAD_X_BEFORE_REMAINDER=" + support::cpp11::to_string(pad_x_before_remainder));
build_opts.add_option("-DPAD_X_AFTER_REMAINDER=" + support::cpp11::to_string(pad_x_after_remainder));
- build_opts.add_option("-DPAD_X_BEFORE_REMAINDER_REFL=" + support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size));
- build_opts.add_option("-DPAD_X_AFTER_REMAINDER_REFL=" + support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size));
+ build_opts.add_option("-DPAD_X_BEFORE_REMAINDER_REFL=" +
+ support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size));
+ build_opts.add_option("-DPAD_X_AFTER_REMAINDER_REFL=" +
+ support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size));
build_opts.add_option("-DAFTER_PAD_FACT_X=" + support::cpp11::to_string(after_pad_fact_x));
- build_opts.add_option_if(after_pad_fact_x < output_last_x, "-DAFTER_PAD_REM=" + support::cpp11::to_string(after_pad_fact_x % vec_size));
+ build_opts.add_option_if(after_pad_fact_x < output_last_x,
+ "-DAFTER_PAD_REM=" + support::cpp11::to_string(after_pad_fact_x % vec_size));
break;
}
@@ -195,14 +191,21 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const
// Create kernel
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+ // Configure window
+ Window win = calculate_max_window(*output->info(), Steps(vec_size));
+ ICLKernel::configure_internal(win);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLPadLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+Status CLPadLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ PixelValue constant_value,
+ PaddingMode mode)
{
- unsigned int vec_size;
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, constant_value, mode));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), padding, constant_value, mode, vec_size).first);
-
return Status{};
}
@@ -211,25 +214,19 @@ void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- Window win_in = window;
- win_in.adjust(Window::DimX, _input_start_x, true);
- win_in.adjust(Window::DimY, _input_start_y, true);
-
- Window slice_out = window.first_slice_window_3D();
- Window slice_in = win_in.first_slice_window_3D();
- unsigned int batch = 0;
+ Window slice = window.first_slice_window_3D();
+ unsigned int batch = 0;
do
{
unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- add_3D_tensor_argument(idx, _output, slice_out);
- if(_4d_enabled)
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ if (_4d_enabled)
{
add_argument<unsigned int>(idx, batch++);
}
- enqueue(queue, *this, slice_out, lws_hint());
- }
- while(window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in));
+ enqueue(queue, *this, slice, lws_hint());
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLPadLayerKernel.h b/src/core/CL/kernels/CLPadLayerKernel.h
new file mode 100644
index 0000000000..dca121b6a1
--- /dev/null
+++ b/src/core/CL/kernels/CLPadLayerKernel.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLPADLAYERKERNEL_H
+#define ARM_COMPUTE_CLPADLAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the PadLayer function. */
+class CLPadLayerKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLPadLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPadLayerKernel(const CLPadLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPadLayerKernel &operator=(const CLPadLayerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLPadLayerKernel(CLPadLayerKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLPadLayerKernel &operator=(CLPadLayerKernel &&) = default;
+ /** Default destructor */
+ ~CLPadLayerKernel() = default;
+ /** Set the input and output tensor.
+ *
+ * @param[in] input Source tensor. Data types supported: All.
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] padding The padding for each spatial dimension of the input tensor. The pair padding[i]
+ * specifies the front and the end padding in the i-th dimension.
+ * @param[in] constant_value (Optional) Constant value to be used for the padding.
+ * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
+ * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
+ */
+ void configure(const ICLTensor *input,
+ ICLTensor *output,
+ const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
+ /** Set the input and output tensor.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. Data types supported: All.
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] padding The padding for each spatial dimension of the input tensor. The pair padding[i]
+ * specifies the front and the end padding in the i-th dimension.
+ * @param[in] constant_value (Optional) Constant value to be used for the padding.
+ * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
+ * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLPadLayerKernel
+ *
+ * @param[in] input Source tensor info. Data types supported: All.
+ * @param[in] output Output tensor info. Data type supported: same as @p input
+ * @param[in] padding The padding for each spatial dimension of the input tensor. The pair padding[i]
+ * specifies the front and the end padding in the i-th dimension.
+ * @param[in] constant_value (Optional) Constant value to be used for the padding.
+ * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
+ * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ bool _4d_enabled;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLPADLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLPermuteKernel.cpp b/src/core/CL/kernels/CLPermuteKernel.cpp
deleted file mode 100644
index e657c4eee0..0000000000
--- a/src/core/CL/kernels/CLPermuteKernel.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLPermuteKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-CLPermuteKernel::CLPermuteKernel()
- : _input(nullptr), _output(nullptr), _perm()
-{
-}
-namespace
-{
-TensorShape get_output_shape(const ITensorInfo *input, const PermutationVector &perm)
-{
- TensorShape output_shape = input->tensor_shape();
- permute(output_shape, perm);
- return output_shape;
-}
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 1 || input->num_dimensions() > 4,
- "Permutation upto 4-D input tensor is supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(perm.num_dimensions() < 1 || perm.num_dimensions() > 4,
- "Permutation vector size should be less than or equal to 4");
- for(const auto &p : perm)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(p >= perm.num_dimensions(), "Permutation vector has invalid values");
- }
-
- // Validate configured output
- if(output->total_size() != 0)
- {
- const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
- return Status{};
-}
-} // namespace
-
-void CLPermuteKernel::configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, perm);
-}
-
-void CLPermuteKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), perm));
-
- _input = input;
- _output = output;
- _perm = perm;
-
- const TensorShape output_shape = get_output_shape(input->info(), perm);
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-
- // Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
- build_opts.add_option("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
- // New positions of width(W), height(H), channel(C) and batch(D) based on permutation vector
- build_opts.add_option("-DP1=" + support::cpp11::to_string((_perm.num_dimensions() >= 1) ? perm[0] : 0));
- build_opts.add_option("-DP2=" + support::cpp11::to_string((_perm.num_dimensions() >= 2) ? perm[1] : 1));
- build_opts.add_option("-DP3=" + support::cpp11::to_string((_perm.num_dimensions() >= 3) ? perm[2] : 2));
- build_opts.add_option("-DP4=" + support::cpp11::to_string((_perm.num_dimensions() >= 4) ? perm[3] : 3));
-
- _kernel = create_kernel(compile_context, "permute", build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
-
- // The CLPermute doesn't need padding so update_window_and_padding() can be skipped
- Coordinates coord;
- coord.set_num_dimensions(output->info()->num_dimensions());
- output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-}
-
-Status CLPermuteKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, perm));
-
- return Status{};
-}
-
-void CLPermuteKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
- // Setup output slice
- Window slice_out(slice_in);
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
- slice_out.set(3, Window::Dimension(0, 0, 0));
-
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice_in);
- add_4D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_in, lws_hint());
- }
- while(window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-}
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
deleted file mode 100644
index 585715a6e6..0000000000
--- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
+++ /dev/null
@@ -1,467 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_UNUSED(overflow_policy);
- ARM_COMPUTE_UNUSED(rounding_policy);
-
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1,
- 1,
- DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2,
- 1,
- DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(output->data_type()));
-
- const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
-
- // Validate in case of configured output
- if(output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output,
- 1,
- DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16, DataType::F16,
- DataType::S32, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
- "Output can only be U8 if both inputs are U8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QASYMM8 && (input1->data_type() != DataType::QASYMM8 || input2->data_type() != DataType::QASYMM8),
- "Output can only be QASYMM8 if both inputs are QASYMM8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QASYMM8_SIGNED && (input1->data_type() != DataType::QASYMM8_SIGNED || input2->data_type() != DataType::QASYMM8_SIGNED),
- "Output can only be QASYMM8_SIGNED if both inputs are QASYMM8_SIGNED");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QSYMM16 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16),
- "Output can only be QSYMM16 if both inputs are QSYMM16");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::S32 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16),
- "Output can only be S32 if both inputs are QSYMM16");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
-{
- const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
- const TensorShape &out_shape = broadcast_pair.first;
- const ValidRegion &valid_region = broadcast_pair.second;
-
- // Auto initialize output if not initialized
- {
- set_shape_if_empty(*output, out_shape);
-
- if(input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16)
- {
- set_format_if_unknown(*output, Format::S16);
- }
- else if(input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
- {
- set_format_if_unknown(*output, Format::F32);
- }
- else if(input1->data_type() == DataType::QASYMM8)
- {
- set_data_type_if_unknown(*output, DataType::QASYMM8);
- }
- else if(input1->data_type() == DataType::QASYMM8_SIGNED)
- {
- set_data_type_if_unknown(*output, DataType::QASYMM8_SIGNED);
- }
- else if(input1->data_type() == DataType::QSYMM16)
- {
- set_data_type_if_unknown(*output, DataType::QSYMM16);
- }
- }
-
- Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
- Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
- Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
-
- AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win_input1, input1_access)
- || update_window_and_padding(win_input2, input2_access)
- || update_window_and_padding(win, output_access);
-
- output_access.set_valid_region(win, valid_region);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLPixelWiseMultiplicationKernel::CLPixelWiseMultiplicationKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
-}
-
-void CLPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(),
- scale, overflow_policy, rounding_policy, act_info));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
- _input1 = input1;
- _input2 = input2;
- _output = output;
-
- int scale_int = -1;
- // Extract sign, exponent and mantissa
- int exponent = 0;
- float normalized_mantissa = std::frexp(scale, &exponent);
- // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
- // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14
- // Moreover, it will be negative as we deal with 1/2^n
- if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
- {
- // Store the positive exponent. We know that we compute 1/2^n
- // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
- scale_int = std::abs(exponent - 1);
- }
-
- std::string acc_type;
- // Check if it has float inputs and output
- if(is_data_type_float(input1->info()->data_type()) || is_data_type_float(input2->info()->data_type()))
- {
- scale_int = -1;
- acc_type = (input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32) ? "float" : "half";
- }
- else
- {
- if(input1->info()->element_size() == 2 || input2->info()->element_size() == 2)
- {
- // Use 32-bit accumulator for 16-bit input
- acc_type = "int";
- }
- else
- {
- // Use 16-bit accumulator for 8-bit input
- acc_type = "ushort";
- }
- }
-
- const bool is_quantized = is_data_type_quantized(input1->info()->data_type());
-
- // Set kernel build options
- std::string kernel_name = "pixelwise_mul";
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
- build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
- build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- if(is_quantized && (output->info()->data_type() != DataType::S32))
- {
- const UniformQuantizationInfo iq1_info = input1->info()->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = input2->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
-
- build_opts.add_option_if(is_data_type_quantized_asymmetric(input1->info()->data_type()),
- "-DOFFSET_IN1=" + support::cpp11::to_string(iq1_info.offset));
- build_opts.add_option_if(is_data_type_quantized_asymmetric(input2->info()->data_type()),
- "-DOFFSET_IN2=" + support::cpp11::to_string(iq2_info.offset));
- build_opts.add_option_if(is_data_type_quantized_asymmetric(output->info()->data_type()),
- "-DOFFSET_OUT=" + support::cpp11::to_string(oq_info.offset));
- build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale));
- build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale));
- build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
- kernel_name += "_quantized";
- }
- else
- {
- kernel_name += (scale_int >= 0) ? "_int" : "_float";
- build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type()), "-DWRAP", "-DSATURATE");
- build_opts.add_option_if_else(rounding_policy == RoundingPolicy::TO_ZERO, "-DROUND=_rtz", "-DROUND=_rte");
- build_opts.add_option("-DACC_DATA_TYPE=" + acc_type);
- if(act_info.enabled())
- {
- build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
- build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
- build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
- }
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set scale argument
- unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the inputs and output parameters
-
- if(scale_int >= 0 && !is_quantized)
- {
- _kernel.setArg(idx++, scale_int);
- }
- else
- {
- _kernel.setArg(idx++, scale);
- }
-
- ICLKernel::configure_internal(win_config.second);
-}
-
-Status CLPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy, act_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
-
- return Status{};
-}
-
-void CLPixelWiseMultiplicationKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const TensorShape &in_shape1 = _input1->info()->tensor_shape();
- const TensorShape &in_shape2 = _input2->info()->tensor_shape();
- const TensorShape &out_shape = _output->info()->tensor_shape();
-
- bool can_collapse = true;
- if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
- {
- can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
- {
- can_collapse = (in_shape1[d] == in_shape2[d]);
- }
- }
-
- bool has_collapsed = false;
- Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
-
- const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
- const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
- Window slice = collapsed.first_slice_window_3D();
- Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
- Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input1, slice_input1);
- add_3D_tensor_argument(idx, _input2, slice_input2);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
-
- ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
- ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLPixelWiseMultiplicationKernel::border_size() const
-{
- const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
- const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
- return BorderSize{ 0, border, 0, 0 };
-}
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration_complex = 1;
-
-Status validate_arguments_complex(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 2, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 2, DataType::F32);
-
- const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(output->data_type()));
-
- // Validate in case of configured output
- if(output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 2, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window_complex(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
-{
- const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
- const TensorShape &out_shape = broadcast_pair.first;
- const ValidRegion &valid_region = broadcast_pair.second;
-
- // Auto initialize output if not initialized
- const TensorInfo out_info(out_shape, input1->num_channels(), input1->data_type());
- auto_init_if_empty(*output, out_info);
-
- Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration_complex));
- Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
- Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
-
- AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration_complex);
- AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration_complex);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_complex);
-
- bool window_changed = update_window_and_padding(win_input1, input1_access)
- || update_window_and_padding(win_input2, input2_access)
- || update_window_and_padding(win, output_access);
-
- output_access.set_valid_region(win, valid_region);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLComplexPixelWiseMultiplicationKernel::CLComplexPixelWiseMultiplicationKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLComplexPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
-}
-
-void CLComplexPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1->info(), input2->info(), output->info(), act_info));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window_complex(input1->info(), input2->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
- _input1 = input1;
- _input2 = input2;
- _output = output;
-
- CLBuildOptions build_opts;
- if(act_info.enabled())
- {
- build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
- build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
- build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, "pixelwise_mul_complex", build_opts.options());
-
- ICLKernel::configure_internal(win_config.second);
-}
-
-Status CLComplexPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(input1, input2, output, act_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_complex(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
-
- return Status{};
-}
-
-void CLComplexPixelWiseMultiplicationKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const TensorShape &in_shape1 = _input1->info()->tensor_shape();
- const TensorShape &in_shape2 = _input2->info()->tensor_shape();
- const TensorShape &out_shape = _output->info()->tensor_shape();
-
- bool can_collapse = true;
- if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
- {
- can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
- {
- can_collapse = (in_shape1[d] == in_shape2[d]);
- }
- }
-
- bool has_collapsed = false;
- Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
-
- const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
- const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
- Window slice = collapsed.first_slice_window_3D();
- Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
- Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input1, slice_input1);
- add_3D_tensor_argument(idx, _input2, slice_input2);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
-
- ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
- ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLComplexPixelWiseMultiplicationKernel::border_size() const
-{
- const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
- const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration_complex - 1U, replicateSize);
- return BorderSize{ 0, border, 0, 0 };
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
deleted file mode 100644
index cf1d7dd8dd..0000000000
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ /dev/null
@@ -1,410 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-#include <tuple>
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-// Internal window config info
-using CLPoolingConfig = std::pair<unsigned int, BorderSize>; //num_elems_processed_per_iteration, border_size
-
-void auto_init(const ITensorInfo *input, ITensorInfo *output, PoolingLayerInfo pool_info)
-{
- TensorShape out_shape = compute_pool_shape(*input, pool_info);
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(out_shape));
-}
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(indices, "Indices not supported in the CL backend.");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type == PoolingType::L2),
- "Unsupported combination of parameters!");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(input->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding()
- && (input->data_layout() == DataLayout::NHWC),
- "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- TensorInfo out_info(TensorInfo(compute_pool_shape(*input, pool_info), 1, output->data_type()));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
- }
-
- return Status{};
-}
-
-std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Get data layout
- const DataLayout data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- unsigned int pooled_w = 0;
- unsigned int pooled_h = 0;
- int pool_size_x = pool_info.is_global_pooling ? input->dimension(idx_width) : pool_info.pool_size.width;
- int pool_size_y = pool_info.is_global_pooling ? input->dimension(idx_height) : pool_info.pool_size.height;
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
- std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- const int pool_pad_right = pad_stride_info.pad_right();
- const int pool_pad_top = pad_stride_info.pad_top();
- const int pool_pad_left = pad_stride_info.pad_left();
- const int pool_pad_bottom = pad_stride_info.pad_bottom();
- BorderSize border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
-
- auto_init(input, output, pool_info);
- pooled_w = output->tensor_shape()[idx_width];
- pooled_h = output->tensor_shape()[idx_height];
-
- const DataType data_type = input->data_type();
-
- const int input_width = input->dimension(idx_width);
- const int input_height = input->dimension(idx_height);
-
- unsigned int num_elems_processed_per_iteration = 0;
- bool window_changed = false;
- Window win{};
- switch(data_layout)
- {
- case DataLayout::NCHW:
- {
- // Change the number of elements processed per iteration
- // for pooling 3x3 with stride less equal than 3
- const bool can_optimize = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type);
- num_elems_processed_per_iteration = can_optimize ? 4 : 1;
- const unsigned int num_elems_read_per_iteration = (num_elems_processed_per_iteration - 1) * pool_stride_x + pool_size_x;
-
- // Number of iterations in X dimension
- const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
-
- // Upper limit for the number of right/bottom border elements that are accessed
- const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
- const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
-
- border_size.right = std::max(upper_bound_w, pool_pad_right);
- border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
-
- win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-
- AccessWindowRectangle input_access(input, -pool_pad_left, -pool_pad_top, num_elems_read_per_iteration, pool_size_y,
- pool_stride_x, pool_stride_y);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- break;
- }
- case DataLayout::NHWC:
- {
- num_elems_processed_per_iteration = 8;
- win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-
- AccessWindowStatic input_access(input,
- 0, -1,
- ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration), input->dimension(1));
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_tuple(err, win, CLPoolingConfig(num_elems_processed_per_iteration, border_size));
-}
-} // namespace
-
-CLPoolingLayerKernel::CLPoolingLayerKernel()
- : _input(nullptr), _output(nullptr), _indices(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _border_size(0), _num_elems_processed_per_iteration(1)
-{
-}
-
-BorderSize CLPoolingLayerKernel::border_size() const
-{
- return _border_size;
-}
-
-void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info, indices);
-}
-
-void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Set instance variables
- _input = input;
- _output = output;
- _pool_info = pool_info;
- _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : pool_info.data_layout;
- _indices = indices;
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- const PoolingType pool_type = pool_info.pool_type;
- const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
- const int pool_size_x = pool_info.is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size.height;
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
- const bool exclude_padding = pool_info.exclude_padding;
- std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- const int pool_pad_top = pad_stride_info.pad_top();
- const int pool_pad_left = pad_stride_info.pad_left();
-
- // Set build options
- CLBuildOptions build_opts;
-
- if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info())
- {
- const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
-
- build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
- build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
- build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale));
- build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
- }
-
- // Check output dimensions
- auto_init(input->info(), output->info(), pool_info);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr));
-
- const DataType data_type = input->info()->data_type();
-
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
- build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type));
- build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x));
- build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y));
- build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_left));
- build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_top));
- build_opts.add_option("-DPOOL_SIZE_X=" + support::cpp11::to_string(pool_size_x));
- build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y));
-
- // Set the initial value for the pooling operation accordingly with the data type
- if(pool_type == PoolingType::MAX)
- {
- if(is_data_type_quantized(data_type))
- {
- PixelValue type_min{};
- std::tie(type_min, std::ignore) = get_min_max(data_type);
- build_opts.add_option("-DINITIAL_VALUE=" + support::cpp11::to_string(type_min.get<int32_t>()));
- }
- else
- {
- build_opts.add_option("-DINITIAL_VALUE=" + float_to_string_with_full_precision(std::numeric_limits<float>::lowest()));
- }
- }
- else
- {
- // Pool AVG and Pool L2 initial value
- build_opts.add_option("-DINITIAL_VALUE=0");
- }
-
- const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision;
- const auto use_wider_accumulator = use_fp_mixed_precision && (pool_type != PoolingType::MAX);
- const auto acc_data_type = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : data_type);
- build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type);
- build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION");
-
- // Create kernel
- switch(_data_layout)
- {
- case DataLayout::NCHW:
- {
- build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left)));
- build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top)));
- if(pool_type != PoolingType::MAX)
- {
- build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
- }
-
- if((pool_size_x == 3) && (pool_size_y == 3) && !is_data_type_quantized_asymmetric(data_type))
- {
- // Check if we have pool3x3 with stride_x less equal than 3. In these cases, run an optimized OpenCL kernel where
- // each thread computes 4 output elements
- const bool is_pool3x3_stride_le3 = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3);
-
- std::string kernel_name = ((is_pool3x3_stride_le3) ? "pooling_layer_optimized_" : "pooling_layer_")
- + support::cpp11::to_string(pool_size_x);
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
- }
- else // Run general case
- {
- std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nchw" : "pooling_layer_MxN_nchw";
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
- }
- break;
- }
- case DataLayout::NHWC:
- {
- build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
- build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
- build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
- build_opts.add_option_if(output->info()->tensor_shape().total_size_upper(3) > 1,
- "-DDST_DEPTH=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
- std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc";
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info);
-
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
- ICLKernel::configure_internal(std::get<1>(win_config));
-
- if(_data_layout == DataLayout::NCHW)
- {
- CLPoolingConfig pooling_config = std::get<2>(win_config);
- _num_elems_processed_per_iteration = pooling_config.first;
- _border_size = pooling_config.second;
- }
- else
- {
- _border_size = BorderSize(1, 0, 0, 0);
- _num_elems_processed_per_iteration = 8;
- }
-
- // Set config_id for enabling LWS tuning
- _config_id = "pooling_layer_";
- _config_id += lower_string(string_from_data_type(data_type));
- _config_id += "_";
- _config_id += lower_string(string_from_data_layout(_data_layout));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(idx_width));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(idx_height));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(idx_channel));
- _config_id += "_";
- _config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
-}
-
-Status CLPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, indices));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info)));
-
- return Status{};
-}
-
-void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- unsigned int pool_stride_x = 0;
- unsigned int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
-
- // Collapse window
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-
- switch(_data_layout)
- {
- case DataLayout::NCHW:
- {
- Window slice = window_collapsed.first_slice_window_3D();
- do
- {
- // Upsample input by pool size
- Window in_slice(slice);
- in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - _pool_info.pad_stride_info.pad_left(),
- (in_slice.x().end() - _pool_info.pad_stride_info.pad_left()) * pool_stride_x,
- pool_stride_x * _num_elems_processed_per_iteration));
- in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - _pool_info.pad_stride_info.pad_top(),
- (in_slice.y().end() - _pool_info.pad_stride_info.pad_top()) * pool_stride_y,
- pool_stride_y));
-
- // Set inputs
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, in_slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
- break;
- }
- case DataLayout::NHWC:
- {
- const size_t total_batches = _output->info()->tensor_shape().total_size_upper(3);
-
- Window slice = window_collapsed.first_slice_window_4D();
- Window in_slice = window_collapsed.first_slice_window_4D();
- in_slice.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration));
- in_slice.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x));
- in_slice.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y));
- in_slice.set(3, Window::Dimension(0, total_batches, 1));
- do
- {
- // Set inputs
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, in_slice);
- add_4D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice));
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
index 07f669af62..7dcdf1de6f 100644
--- a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,18 +21,19 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h"
+#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
using namespace arm_compute::misc::shape_calculator;
@@ -41,7 +42,10 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status validate_arguments(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
@@ -50,10 +54,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
// Check variances
const int var_size = info.variances().size();
- if(var_size > 1)
+ if (var_size > 1)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size != 4, "Must provide 4 variance values");
- for(int i = 0; i < var_size; ++i)
+ for (int i = 0; i < var_size; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size <= 0, "Must be greater than 0");
}
@@ -61,17 +65,19 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[0] < 0.f, "Step x should be greater or equal to 0");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[1] < 0.f, "Step y should be greater or equal to 0");
- if(!info.max_sizes().empty())
+ if (!info.max_sizes().empty())
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), "Max and min sizes dimensions should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(),
+ "Max and min sizes dimensions should match");
}
- for(unsigned int i = 0; i < info.max_sizes().size(); ++i)
+ for (unsigned int i = 0; i < info.max_sizes().size(); ++i)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], "Max size should be greater than min size");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i],
+ "Max size should be greater than min size");
}
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2);
}
@@ -79,7 +85,11 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, const PriorBoxLayerInfo &info, int num_priors)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ ITensorInfo *output,
+ const PriorBoxLayerInfo &info,
+ int num_priors)
{
ARM_COMPUTE_UNUSED(input2);
// Output tensor auto initialization if not yet initialized
@@ -87,10 +97,11 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
auto_init_if_empty(*output, output_shape, 1, input1->data_type());
const unsigned int num_elems_processed_per_iteration = 4 * num_priors;
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
bool window_changed = update_window_and_padding(win, output_access);
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
@@ -98,15 +109,28 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
CLPriorBoxLayerKernel::CLPriorBoxLayerKernel()
: _input1(nullptr), _input2(nullptr), _output(nullptr), _info(), _num_priors(), _min(), _max(), _aspect_ratios()
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLPriorBoxLayerKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios)
+void CLPriorBoxLayerKernel::configure(const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info,
+ cl::Buffer *min,
+ cl::Buffer *max,
+ cl::Buffer *aspect_ratios)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info, min, max, aspect_ratios);
}
-void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min,
- cl::Buffer *max, cl::Buffer *aspect_ratios)
+void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info,
+ cl::Buffer *min,
+ cl::Buffer *max,
+ cl::Buffer *aspect_ratios)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
@@ -133,7 +157,7 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c
int img_width = info.img_size().x;
int img_height = info.img_size().y;
- if(img_width == 0 || img_height == 0)
+ if (img_width == 0 || img_height == 0)
{
img_width = input2->info()->dimension(width_idx);
img_height = input2->info()->dimension(height_idx);
@@ -141,7 +165,7 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c
float step_x = info.steps()[0];
float step_y = info.steps()[0];
- if(step_x == 0.f || step_y == 0.f)
+ if (step_x == 0.f || step_y == 0.f)
{
step_x = static_cast<float>(img_width) / layer_width;
step_y = static_cast<float>(img_height) / layer_height;
@@ -160,18 +184,20 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c
build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(info.offset()));
build_opts.add_option_if(info.clip(), "-DIN_PLACE");
- if(info.variances().size() > 1)
+ if (info.variances().size() > 1)
{
- for(unsigned int i = 0; i < info.variances().size(); ++i)
+ for (unsigned int i = 0; i < info.variances().size(); ++i)
{
- build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(i)));
+ build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" +
+ support::cpp11::to_string(info.variances().at(i)));
}
}
else
{
- for(unsigned int i = 0; i < 4; ++i)
+ for (unsigned int i = 0; i < 4; ++i)
{
- build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(0)));
+ build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" +
+ support::cpp11::to_string(info.variances().at(0)));
}
}
@@ -192,13 +218,17 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c
ICLKernel::configure_internal(win_config.second);
}
-Status CLPriorBoxLayerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status CLPriorBoxLayerKernel::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, info));
const int num_priors = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get(), info, num_priors)
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(),
+ output->clone().get(), info, num_priors)
+ .first);
return Status{};
}
@@ -209,8 +239,9 @@ void CLPriorBoxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
queue.enqueueWriteBuffer(*_min, CL_TRUE, 0, _info.min_sizes().size() * sizeof(float), _info.min_sizes().data());
- queue.enqueueWriteBuffer(*_aspect_ratios, CL_TRUE, 0, _info.aspect_ratios().size() * sizeof(float), _info.aspect_ratios().data());
- if(!_info.max_sizes().empty())
+ queue.enqueueWriteBuffer(*_aspect_ratios, CL_TRUE, 0, _info.aspect_ratios().size() * sizeof(float),
+ _info.aspect_ratios().data());
+ if (!_info.max_sizes().empty())
{
queue.enqueueWriteBuffer(*_max, CL_TRUE, 0, _info.max_sizes().size() * sizeof(float), _info.max_sizes().data());
}
diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.h b/src/core/CL/kernels/CLPriorBoxLayerKernel.h
new file mode 100644
index 0000000000..a50e0c5ff5
--- /dev/null
+++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLPRIORBOXLAYERKERNEL_H
+#define ARM_COMPUTE_CLPRIORBOXLAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the PriorBox layer kernel. */
+class CLPriorBoxLayerKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLPriorBoxLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPriorBoxLayerKernel(const CLPriorBoxLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPriorBoxLayerKernel &operator=(const CLPriorBoxLayerKernel &) = delete;
+ /** Default Move Constructor. */
+ CLPriorBoxLayerKernel(CLPriorBoxLayerKernel &&) = default;
+ /** Default move assignment operator */
+ CLPriorBoxLayerKernel &operator=(CLPriorBoxLayerKernel &&) = default;
+ /** Default destructor */
+ ~CLPriorBoxLayerKernel() = default;
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] input1 First source tensor. Data types supported: F32. Data layouts supported: NCHW/NHWC.
+ * @param[in] input2 Second source tensor. Data types and layouts supported: same as @p input1
+ * @param[out] output Destination tensor. Output dimensions are [W * H * num_priors * 4, 2]. Data types and layouts supported: same as @p input1
+ * @param[in] info Prior box layer info.
+ * @param[in] min Minimum prior box values
+ * @param[in] max Maximum prior box values
+ * @param[in] aspect_ratios Aspect ratio values
+ */
+ void configure(const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info,
+ cl::Buffer *min,
+ cl::Buffer *max,
+ cl::Buffer *aspect_ratios);
+ /** Set the input and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input1 First source tensor. Data types supported: F32. Data layouts supported: NCHW/NHWC.
+ * @param[in] input2 Second source tensor. Data types and layouts supported: same as @p input1
+ * @param[out] output Destination tensor. Output dimensions are [W * H * num_priors * 4, 2]. Data types and layouts supported: same as @p input1
+ * @param[in] info Prior box layer info.
+ * @param[in] min Minimum prior box values
+ * @param[in] max Maximum prior box values
+ * @param[in] aspect_ratios Aspect ratio values
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info,
+ cl::Buffer *min,
+ cl::Buffer *max,
+ cl::Buffer *aspect_ratios);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLPriorBoxLayerKernel
+ *
+ * @param[in] input1 First source tensor info. Data types supported: F32. Data layouts supported: NCHW/NHWC.
+ * @param[in] input2 Second source tensor info. Data types and layouts supported: same as @p input1
+ * @param[in] output Destination tensor info. Output dimensions are [W * H * num_priors * 4, 2]. Data type supported: same as @p input1
+ * @param[in] info Prior box layer info.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input1;
+ const ICLTensor *_input2;
+ ICLTensor *_output;
+ PriorBoxLayerInfo _info;
+ int _num_priors;
+ cl::Buffer *_min;
+ cl::Buffer *_max;
+ cl::Buffer *_aspect_ratios;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLPRIORBOXLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
index d9da3cb36e..731fcb8e04 100644
--- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,10 +21,15 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
+#include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
@@ -46,15 +51,19 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
const uint32_t temp_num_elems_processed_per_iteration = max_cl_vector_width / input->element_size();
/* If width is less than step, then make step same as width to avoid global size being step instead of actual width. */
/* Or we should fix in arm_compute::enqueue() or arm_compute::calculate_max_window(). */
- const uint32_t num_elems_processed_per_iteration = (input->dimension(0) < temp_num_elems_processed_per_iteration) ? input->dimension(0) : temp_num_elems_processed_per_iteration;
+ const uint32_t num_elems_processed_per_iteration = (input->dimension(0) < temp_num_elems_processed_per_iteration)
+ ? input->dimension(0)
+ : temp_num_elems_processed_per_iteration;
// This kernel doesn't need padding
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
return std::make_pair(Status{}, win);
}
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *weight,
+ const ITensorInfo *bias)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weight, bias, output);
@@ -70,7 +79,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(weight, bias);
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -82,11 +91,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
CLQLSTMLayerNormalizationKernel::CLQLSTMLayerNormalizationKernel()
: _input(nullptr), _weight(nullptr), _bias(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias)
+void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *weight,
+ const ICLTensor *bias)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, bias, output);
+ auto padding_info = get_padding_info({input, weight, bias, output});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), weight->info(), bias->info()));
@@ -100,7 +115,8 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_
int32_t output_multiplier{};
int32_t output_shift{};
const UniformQuantizationInfo quan_info = _weight->info()->quantization_info().uniform();
- const Status status = quantization::calculate_quantized_multiplier(quan_info.scale, &output_multiplier, &output_shift);
+ const Status status =
+ quantization::calculate_quantized_multiplier(quan_info.scale, &output_multiplier, &output_shift);
output_shift *= -1;
// Set build options
@@ -110,8 +126,12 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_
build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
- build_opts.add_option("-DMIN_BOUND=" + support::cpp11::to_string(std::get<0>(quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
- build_opts.add_option("-DMAX_BOUND=" + support::cpp11::to_string(std::get<1>(quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
+ build_opts.add_option("-DMIN_BOUND=" +
+ support::cpp11::to_string(std::get<0>(
+ quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
+ build_opts.add_option("-DMAX_BOUND=" +
+ support::cpp11::to_string(std::get<1>(
+ quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
// Create kernel
_kernel = create_kernel(compile_context, "qlstm_layer_normalization", build_opts.options());
@@ -128,14 +148,21 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_
_config_id += support::cpp11::to_string(input->info()->dimension(0));
_config_id += "_";
_config_id += support::cpp11::to_string(input->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-void CLQLSTMLayerNormalizationKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias)
+void CLQLSTMLayerNormalizationKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *weight,
+ const ICLTensor *bias)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, weight, bias);
}
-Status CLQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias)
+Status CLQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *weight,
+ const ITensorInfo *bias)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, weight, bias));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
@@ -166,7 +193,6 @@ void CLQLSTMLayerNormalizationKernel::run(const Window &window, cl::CommandQueue
add_2D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
+ } while (window.slide_window_slice_2D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
new file mode 100644
index 0000000000..ba912e1d2d
--- /dev/null
+++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLQLSTMLAYERVNORMALIZATIONKERNEL_H
+#define ARM_COMPUTE_CLQLSTMLAYERVNORMALIZATIONKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to do layer normalization. */
+class CLQLSTMLayerNormalizationKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLQLSTMLayerNormalizationKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLQLSTMLayerNormalizationKernel(const CLQLSTMLayerNormalizationKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLQLSTMLayerNormalizationKernel &operator=(const CLQLSTMLayerNormalizationKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLQLSTMLayerNormalizationKernel(CLQLSTMLayerNormalizationKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLQLSTMLayerNormalizationKernel &operator=(CLQLSTMLayerNormalizationKernel &&) = default;
+ /** Default destructor */
+ ~CLQLSTMLayerNormalizationKernel() = default;
+ /** Initialise the kernel's input and outputs.
+ *
+ * @param[in] input Source tensor with 2 dimensions. Data types supported: QSYMM16.
+ * @param[out] output Destination tensor. Data type supported: same as @p input
+ * @param[in] weight Weight tensor. Data types supported: Same as @p input.
+ * @param[in] bias Bias tensor. Data types supported: S32.
+ *
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias);
+ /** Initialise the kernel's input and outputs.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor with 2 dimensions. Data types supported: QSYMM16.
+ * @param[out] output Destination tensor. Data type supported: same as @p input
+ * @param[in] weight Weight tensor. Data types supported: Same as @p input.
+ * @param[in] bias Bias tensor. Data types supported: S32.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *weight,
+ const ICLTensor *bias);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLQLSTMLayerNormalizationKernel
+ *
+ * @param[in] input Source tensor info with 2 dimensions. Data types supported: QSYMM16.
+ * @param[in] output Destination tensor info. Data type supported: same as @p input
+ * @param[in] weight Weight tensor info. Data types supported: Same as @p input.
+ * @param[in] bias Bias tensor info. Data types supported: S32.
+ *
+ * @return a status
+ */
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ const ICLTensor *_weight;
+ const ICLTensor *_bias;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLQLSTMLAYERVNORMALIZATIONKERNEL_H */
diff --git a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
deleted file mode 100644
index b4b2217391..0000000000
--- a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-
- // Output must always be initialized
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps());
-
- const int vec_size_x = 16 / input->element_size();
- const int input_width_x = input->tensor_shape().x();
- const bool multi_access_x = (input_width_x / vec_size_x > 0);
- if(multi_access_x)
- {
- win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
- }
-
- Coordinates coord;
- coord.set_num_dimensions(output->num_dimensions());
- output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
-
- return std::make_pair(Status{}, win);
-}
-} // namespace
-
-CLQuantizationLayerKernel::CLQuantizationLayerKernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLQuantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLQuantizationLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
- _input = input;
- _output = output;
-
- const int vec_size_x = 16 / input->info()->element_size();
- const int input_width_x = input->info()->tensor_shape().x();
- const bool multi_access_x = (input_width_x / vec_size_x > 0);
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- const UniformQuantizationInfo qinfo = output->info()->quantization_info().uniform();
- const DataType output_data_type = output->info()->data_type();
-
- float scale_to_apply = qinfo.scale;
- int32_t offset_to_apply = qinfo.offset;
- if(is_data_type_quantized_asymmetric(_input->info()->data_type()))
- {
- /*
- * In case of requantization of a quantized input tensor to an output tensor with another quantization
- * instead of of apply dequantization and then a quantization functions, we just compute new scale and
- * offset to apply.
- *
- * Assuming:
- * - q_i as input quantized value
- * - q_o as output quantized value
- * - z_i as input quantization offset value
- * - z_o as output quantization offset value
- * - s_i as input quantization scale value
- * - s_o as output quantization scale value
- * - z_n as new quantization offset value
- * - s_n as new quantization scale value
- *
- * q_o = ( q_i - z_i ) * s_i / s_o + z_o
- *
- * We can rewrite the formula as:
- *
- * q_o = ( q_i * s_i / s_o ) - z_i * s_i / s_o + z_o
- *
- * q_o = q_i / s_n + z_n
- *
- * Where:
- *
- * s_n = s_o / s_i
- *
- * z_n = - z_i * s_i / s_o + z_o
- *
- */
- const UniformQuantizationInfo qinfo_in = _input->info()->quantization_info().uniform();
- scale_to_apply /= qinfo_in.scale;
- // In order to minimize flooring we convert the offset to a float,
- // then compute the new offset in the float domain,
- // finally we convert it back as int32_t
- offset_to_apply -= static_cast<int32_t>(static_cast<float>(qinfo_in.offset) * qinfo_in.scale / qinfo.scale);
- }
-
- // Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option_if(is_data_type_float(_input->info()->data_type()), "-DIS_FLOAT");
- build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_to_apply));
- build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_to_apply));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
- build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output_data_type));
- build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0)));
- std::pair<int, int> min_max_quant_values = quantization::get_min_max_values_from_quantized_data_type(output_data_type);
- build_opts.add_option("-DMIN_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.first));
- build_opts.add_option("-DMAX_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.second));
-
- _kernel = create_kernel(compile_context, "quantization_layer", build_opts.options());
-}
-
-Status CLQuantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
-
- return Status{};
-}
-
-void CLQuantizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3);
- Window slice = window_collapsed.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
index de99223bbc..c97910ef79 100644
--- a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,20 +21,20 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h"
+#include "src/core/CL/kernels/CLROIAlignLayerKernel.h"
-#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLArray.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
using namespace arm_compute::misc::shape_calculator;
@@ -43,24 +43,29 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5);
ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F32, DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC, DataLayout::NCHW);
ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info),
+ output->tensor_shape());
}
- if(is_data_type_quantized_asymmetric(input->data_type()))
+ if (is_data_type_quantized_asymmetric(input->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::QASYMM16);
@@ -75,47 +80,37 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITe
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output auto inizialitation if not yet initialized
- const TensorShape output_shape = compute_roi_align_shape(*input, *rois, pool_info);
- auto_init_if_empty((*output), output_shape, 1, input->data_type());
- output->set_data_layout(input->data_layout());
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 1;
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input_access(input, input->valid_region().start(0), num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
} // namespace
CLROIAlignLayerKernel::CLROIAlignLayerKernel()
: _input(nullptr), _output(nullptr), _rois(nullptr), _pool_info(0, 0, 0.f)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLROIAlignLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayerKernel::configure(const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
}
-void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info));
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), rois->info(), output->info(), pool_info);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ // Output auto initialization if not yet initialized
+ const TensorShape output_shape = compute_roi_align_shape(*input->info(), *rois->info(), pool_info);
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+ output->info()->set_data_layout(input->info()->data_layout());
+
+ auto padding_info = get_padding_info({input, rois, output});
_input = input;
_output = output;
@@ -129,16 +124,23 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
build_opts.add_option("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH))));
- build_opts.add_option("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT))));
- build_opts.add_option("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL))));
+ build_opts.add_option("-DMAX_DIM_X=" +
+ support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(
+ input->info()->data_layout(), DataLayoutDimension::WIDTH))));
+ build_opts.add_option("-DMAX_DIM_Y=" +
+ support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(
+ input->info()->data_layout(), DataLayoutDimension::HEIGHT))));
+ build_opts.add_option("-DMAX_DIM_Z=" +
+ support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(
+ input->info()->data_layout(), DataLayoutDimension::CHANNEL))));
build_opts.add_option("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width()));
build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height()));
build_opts.add_option("-DSPATIAL_SCALE=" + float_to_string_with_full_precision(pool_info.spatial_scale()));
build_opts.add_option_if(input->info()->data_layout() == DataLayout::NHWC, "-DNHWC");
- build_opts.add_option_if(pool_info.sampling_ratio() > 0, "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio()));
+ build_opts.add_option_if(pool_info.sampling_ratio() > 0,
+ "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio()));
- if(is_qasymm)
+ if (is_qasymm)
{
const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
const UniformQuantizationInfo roisq_info = rois->info()->quantization_info().uniform();
@@ -156,10 +158,16 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c
const std::string kernel_name = (is_qasymm) ? "roi_align_layer_quantized" : "roi_align_layer";
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
- ICLKernel::configure_internal(win_config.second);
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps());
+ ICLKernel::configure_internal(win);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIAlignLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info));
return Status{};
diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.h b/src/core/CL/kernels/CLROIAlignLayerKernel.h
new file mode 100644
index 0000000000..2e84e5d303
--- /dev/null
+++ b/src/core/CL/kernels/CLROIAlignLayerKernel.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLROIALIGNLAYERKERNEL_H
+#define ARM_COMPUTE_CLROIALIGNLAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the RoIAlign kernel.
+ */
+class CLROIAlignLayerKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLROIAlignLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLROIAlignLayerKernel(const CLROIAlignLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLROIAlignLayerKernel &operator=(const CLROIAlignLayerKernel &) = delete;
+ /** Default Move Constructor. */
+ CLROIAlignLayerKernel(CLROIAlignLayerKernel &&) = default;
+ /** Default move assignment operator. */
+ CLROIAlignLayerKernel &operator=(CLROIAlignLayerKernel &&) = default;
+ /** Default destructor */
+ ~CLROIAlignLayerKernel() = default;
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] rois ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner
+ * as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ].
+ * Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8/QASYMM8_SIGNED, otherwise same as @p input
+ * @param[out] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
+ *
+ * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
+ * width and pooled height.
+ * @note The z dimensions of @p output tensor and @p input tensor must be the same.
+ * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
+ */
+ void
+ configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ /** Set the input and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] rois ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner
+ * as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ].
+ * Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8/QASYMM8_SIGNED, otherwise same as @p input
+ * @param[out] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
+ *
+ * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
+ * width and pooled height.
+ * @note The z dimensions of @p output tensor and @p input tensor must be the same.
+ * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLROIAlignLayerKernel
+ *
+ * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] rois ROIs tensor info. Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8/QASYMM8_SIGNED,
+ * otherwise same as @p input
+ * @param[in] output Destination tensor info. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
+ *
+ * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
+ * width and pooled height.
+ * @note The z dimensions of @p output tensor and @p input tensor must be the same.
+ * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
+ *
+ * @return a Status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue);
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ const ICLTensor *_rois;
+ ROIPoolingLayerInfo _pool_info;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLROIALIGNLAYERKERNEL_H*/
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
index a5b80eb5ef..1b2c414a49 100644
--- a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,85 +21,86 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h"
+#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"
-#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLArray.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
+#include <cfloat>
#include <cmath>
-#include <set>
#include <string>
namespace arm_compute
{
-namespace
-{
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+CLROIPoolingLayerKernel::CLROIPoolingLayerKernel()
+ : _input(nullptr), _rois(nullptr), _output(nullptr), _pool_info(0, 0, 0.f)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output auto initialization if not yet initialized
- TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->dimension(2), rois->dimension(1));
- auto_init_if_empty((*output), output_shape, 1, input->data_type());
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 1;
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input_access(input, input->valid_region().start(0), num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
+ _type = CLKernelType::ELEMENTWISE;
}
-} // namespace
-CLROIPoolingLayerKernel::CLROIPoolingLayerKernel()
- : _input(nullptr), _rois(nullptr), _output(nullptr), _pool_info(0, 0, 0.f)
+Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
+
+ //Validate arguments
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::U16);
+ ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5);
+ ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
+
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) ||
+ (output->dimension(1) != pool_info.pooled_height()));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2));
+ ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3));
+ }
+
+ return Status{};
}
-void CLROIPoolingLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayerKernel::configure(const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
}
-void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ const ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, rois, output);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ CLROIPoolingLayerKernel::validate(input->info(), rois->info(), output->info(), pool_info));
- //Validate arguments
- ARM_COMPUTE_ERROR_ON_NULLPTR(input->info(), rois->info(), output->info());
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::U16);
- ARM_COMPUTE_ERROR_ON(rois->info()->dimension(0) != 5);
- ARM_COMPUTE_ERROR_ON(rois->info()->num_dimensions() > 2);
- ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
-
- if(output->info()->total_size() != 0)
- {
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
- ARM_COMPUTE_ERROR_ON(rois->info()->dimension(1) != output->info()->dimension(3));
- }
+ auto padding_info = get_padding_info({input, rois, output});
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), rois->info(), output->info(), pool_info);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ // Output auto initialization if not yet initialized
+ TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2),
+ rois->info()->dimension(1));
+ auto_init_if_empty(*(output->info()), output_shape, 1, input->info()->data_type(),
+ output->info()->quantization_info());
// Set instance variables
_input = input;
@@ -107,27 +108,46 @@ void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context,
_output = output;
_pool_info = pool_info;
+ const DataType data_type = input->info()->data_type();
+ const bool is_qasymm = is_data_type_quantized_asymmetric(data_type);
+
// Set build options
- std::set<std::string> build_opts;
- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- build_opts.emplace(("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type())));
- build_opts.emplace(("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(Window::DimX))));
- build_opts.emplace(("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(Window::DimY))));
- build_opts.emplace(("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(Window::DimZ))));
- build_opts.emplace(("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width())));
- build_opts.emplace(("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height())));
- build_opts.emplace(("-DSPATIAL_SCALE=" + support::cpp11::to_string(pool_info.spatial_scale())));
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DDATA_SIZE=" + get_data_size_from_data_type(data_type));
+ build_opts.add_option("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(Window::DimX)));
+ build_opts.add_option("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(Window::DimY)));
+ build_opts.add_option("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(Window::DimZ)));
+ build_opts.add_option("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width()));
+ build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height()));
+ build_opts.add_option("-DSPATIAL_SCALE=" + support::cpp11::to_string(pool_info.spatial_scale()));
+
+ if (is_qasymm)
+ {
+ // Determine quantization info scale, offset
+ UniformQuantizationInfo uqinfo = UniformQuantizationInfo();
+ uqinfo = compute_requantization_scale_offset(_input->info()->quantization_info().uniform(),
+ _output->info()->quantization_info().uniform());
+ build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(uqinfo.offset));
+ build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(uqinfo.scale));
+
+ // Specify minimum possible value of datatype
+ build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(0));
+ }
+ else
+ {
+ // Specify min value of F32 datatype
+ build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(-FLT_MAX));
+ }
+
+ Window win = calculate_max_window(*(output->info()), Steps());
+ ICLKernel::configure_internal(win);
// Create kernel
std::string kernel_name = "roi_pooling_layer";
- _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
- // Set static kernel arguments
- unsigned int idx = 2 * num_arguments_per_3D_tensor() + num_arguments_per_1D_array();
- add_argument<cl_uint>(idx, _input->info()->strides_in_bytes()[3]);
- add_argument<cl_uint>(idx, _output->info()->strides_in_bytes()[3]);
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
- ICLKernel::configure_internal(win_config.second);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
void CLROIPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.h b/src/core/CL/kernels/CLROIPoolingLayerKernel.h
new file mode 100644
index 0000000000..80bfb63092
--- /dev/null
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLROIPOOLINGLAYERKERNEL_H
+#define ARM_COMPUTE_CLROIPOOLINGLAYERKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the ROI pooling layer kernel */
+class CLROIPoolingLayerKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLROIPoolingLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLROIPoolingLayerKernel(const CLROIPoolingLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLROIPoolingLayerKernel &operator=(const CLROIPoolingLayerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLROIPoolingLayerKernel(CLROIPoolingLayerKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLROIPoolingLayerKernel &operator=(CLROIPoolingLayerKernel &&) = default;
+ /** Default destructor */
+ ~CLROIPoolingLayerKernel() = default;
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: F16/F32/QASYMM8.
+ * @param[in] rois ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner
+ * as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ]. Data types supported: U16
+ * @param[out] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
+ *
+ * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
+ * width and pooled height.
+ * @note The z dimensions of @p output tensor and @p input tensor must be the same.
+ * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
+ */
+ void
+ configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ /** Set the input and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. Data types supported: F16/F32/QASYMM8
+ * @param[in] rois ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner
+ * as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ]. Data types supported: U16
+ * @param[out] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
+ *
+ * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
+ * width and pooled height.
+ * @note The z dimensions of @p output tensor and @p input tensor must be the same.
+ * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ const ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+ /** Static Validate function to check inputs will lead to valid configuration of @ref CLROIPoolingLayer
+ *
+ * @param[in] input Source tensor. Data types supported: F16/F32/QASYMM8
+ * @param[in] rois ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner
+ * as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ]. Data types supported: U16
+ * @param[in] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
+ *
+ * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
+ * width and pooled height.
+ * @note The z dimensions of @p output tensor and @p input tensor must be the same.
+ * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
+
+private:
+ const ICLTensor *_input;
+ const ICLTensor *_rois;
+ const ICLTensor *_output;
+ ROIPoolingLayerInfo _pool_info;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLROIPOOLINGLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLRangeKernel.cpp b/src/core/CL/kernels/CLRangeKernel.cpp
index 1e97649e0a..622f6210b9 100644
--- a/src/core/CL/kernels/CLRangeKernel.cpp
+++ b/src/core/CL/kernels/CLRangeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,74 +21,57 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLRangeKernel.h"
+#include "src/core/CL/kernels/CLRangeKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Utils.h"
-#include "support/StringSupport.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
-using namespace arm_compute;
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/StringSupport.h"
-namespace
+namespace arm_compute
{
-unsigned int get_num_elems_processed_per_iteration(const DataType dt)
+namespace
{
- unsigned int num_elems_processed_per_iteration = preferred_vector_width(CLKernelLibrary::get().get_device(), dt);
- if(num_elems_processed_per_iteration > 8)
- {
- num_elems_processed_per_iteration = 8; //kernel uses only 8 lanes.
- }
- return num_elems_processed_per_iteration;
-}
+constexpr unsigned int vector_size_byte_opencl = 16;
-Status validate_arguments(const ITensorInfo &output, const float start, const float end, const float step)
+Status validate_arguments(const ITensorInfo *output, const float start, const float end, const float step)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output,
- 1,
- DataType::U8, DataType::S8, DataType::QASYMM8,
- DataType::U16, DataType::S16,
- DataType::U32, DataType::S32,
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16, DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&output);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output);
ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start < end) && (step <= 0)), "step must be greater than 0 when start < end");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start > end) && (step >= 0)), "step must be less than 0 when start > end");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output.data_type(), output.quantization_info()), "start value is outside the range of the data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output.data_type(), output.quantization_info()), "end value is outside the range of the data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output.data_type(), output.quantization_info()), "step value is outside the range of the data type");
-
ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.num_dimensions() != 1, "Output has to be a 1-D tensor");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.tensor_shape().total_size() < num_of_elements_in_range(start, end, step), "Output tensor size is incorrect");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output->data_type(), output->quantization_info()),
+ "start value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output->data_type(), output->quantization_info()),
+ "end value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output->data_type(), output->quantization_info()),
+ "step value is outside the range of the data type");
- return Status{};
-}
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->num_dimensions() != 1, "Output has to be a 1-D tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() < num_of_elements_in_range(start, end, step),
+ "Output tensor size is incorrect");
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo &output, const float start, const float end, const float step)
-{
- unsigned int num_elems_processed_per_iteration = get_num_elems_processed_per_iteration(output.data_type());
- // Auto initialize output if not initialized
- auto_init_if_empty(output, TensorShape(num_of_elements_in_range(start, end, step)), 1, output.data_type(), output.quantization_info());
-
- // Configure kernel window
- Window win = calculate_max_window(output, Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
- bool window_changed = update_window_and_padding(win, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), TensorShape(num_of_elements_in_range(start, end, step))));
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
+ return Status{};
}
} // namespace
-CLRangeKernel::CLRangeKernel()
- : _start(0), _end(1), _step(1), _output(nullptr)
+CLRangeKernel::CLRangeKernel() : _start(0), _end(1), _step(1), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLRangeKernel::configure(ICLTensor *output, const float start, const float end, const float step)
@@ -96,15 +79,18 @@ void CLRangeKernel::configure(ICLTensor *output, const float start, const float
configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step);
}
-void CLRangeKernel::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
+void CLRangeKernel::configure(
+ const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*(output->info()), start, end, step));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(output->info(), start, end, step));
// Configure kernel window
- auto win_config = validate_and_configure_window(*(output->info()), start, end, step);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(vector_size_byte_opencl / output->info()->element_size(), output->info()->dimension(0));
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+ auto padding_info = get_padding_info({output});
_start = start;
_end = end;
@@ -113,14 +99,15 @@ void CLRangeKernel::configure(const CLCompileContext &compile_context, ICLTensor
std::string kernel_name = "range";
- unsigned int num_elems_processed_per_iteration = get_num_elems_processed_per_iteration(output->info()->data_type());
// Set build options
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration));
build_opts.add_option("-DSTART=" + support::cpp11::to_string(start));
build_opts.add_option("-DSTEP=" + support::cpp11::to_string(step));
- if(is_data_type_quantized_asymmetric(output->info()->data_type()))
+ if (is_data_type_quantized_asymmetric(output->info()->data_type()))
{
const UniformQuantizationInfo qinfo = output->info()->quantization_info().uniform();
build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(qinfo.offset));
@@ -129,7 +116,7 @@ void CLRangeKernel::configure(const CLCompileContext &compile_context, ICLTensor
}
// Create kernel
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
- ICLKernel::configure_internal(win_config.second);
+ ICLKernel::configure_internal(win);
// Set config_id for enabling LWS tuning
_config_id = kernel_name;
@@ -137,15 +124,12 @@ void CLRangeKernel::configure(const CLCompileContext &compile_context, ICLTensor
_config_id += lower_string(string_from_data_type(output->info()->data_type()));
_config_id += "_";
_config_id += support::cpp11::to_string(output->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
Status CLRangeKernel::validate(const ITensorInfo *output, const float start, const float end, const float step)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
-
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*output, start, end, step));
- ARM_COMPUTE_RETURN_ON_ERROR((validate_and_configure_window(*(output->clone()), start, end, step)).first);
-
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(output, start, end, step));
return Status{};
}
@@ -158,3 +142,4 @@ void CLRangeKernel::run(const Window &window, cl::CommandQueue &queue)
enqueue(queue, *this, window, lws_hint());
}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLRangeKernel.h b/src/core/CL/kernels/CLRangeKernel.h
new file mode 100644
index 0000000000..65251a11e5
--- /dev/null
+++ b/src/core/CL/kernels/CLRangeKernel.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLRANGEKERNEL_H
+#define ARM_COMPUTE_CLRANGEKERNEL_H
+
+#include "arm_compute/core/Types.h"
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Kernel class for Range
+ *
+ * Generates a 1-D tensor containing a sequence of numbers that begins at 'start' and advances in increments
+ * of 'step' up to, but not including, 'end'.
+ */
+class CLRangeKernel : public ICLKernel
+{
+public:
+ /** Default constructor (start = 0, end = 1, step = 1, no output tensor set) */
+ CLRangeKernel();
+ /** Prevent instances of this class from being copy-constructed (as this class contains pointers) */
+ CLRangeKernel(const CLRangeKernel &) = delete;
+ /** Prevent instances of this class from being copy-assigned (as this class contains pointers) */
+ CLRangeKernel &operator=(const CLRangeKernel &) = delete;
+ /** Allow instances of this class to be move-constructed */
+ CLRangeKernel(CLRangeKernel &&) = default;
+ /** Allow instances of this class to be move-assigned */
+ CLRangeKernel &operator=(CLRangeKernel &&) = default;
+ /** Default destructor */
+ ~CLRangeKernel() = default;
+ /** Initialize the kernel's output tensor, start, end and step of the sequence, using the compile context of @ref CLKernelLibrary.
+ *
+ * @param[out] output Output tensor; must be 1-D. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
+ * @param[in] start The first value of the sequence.
+ * @param[in] end The end value of the sequence (exclusive). Must not be equal to @p start.
+ * @param[in] step The increment between consecutive values: greater than 0 when start < end, less than 0 when start > end.
+ */
+ void configure(ICLTensor *output, float start, float end, float step);
+ /** Initialize the kernel's output tensor, start, end and step of the sequence.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[out] output Output tensor; must be 1-D. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
+ * @param[in] start The first value of the sequence.
+ * @param[in] end The end value of the sequence (exclusive). Must not be equal to @p start.
+ * @param[in] step The increment between consecutive values: greater than 0 when start < end, less than 0 when start > end.
+ */
+ void configure(const CLCompileContext &compile_context, ICLTensor *output, float start, float end, float step);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLRangeKernel
+ *
+ * @param[in] output Output tensor info; must be 1-D. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
+ * @param[in] start The first value of the sequence.
+ * @param[in] end The end value of the sequence (exclusive). Must not be equal to @p start.
+ * @param[in] step The increment between consecutive values: greater than 0 when start < end, less than 0 when start > end.
+ *
+ * @return a status (an error if an argument constraint above is violated, a value is outside the data type's range, or output is too small)
+ */
+ static Status validate(const ITensorInfo *output, float start, float end, float step);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ float _start; /**< First value of the sequence */
+ float _end; /**< End value of the sequence (exclusive) */
+ float _step; /**< Increment between consecutive values */
+ ICLTensor *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLRANGEKERNEL_H */
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 33e71445c4..c8665f8fbd 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,48 +21,54 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
+#include "src/core/CL/kernels/CLReductionOperationKernel.h"
-#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-// OpenCL kernel requires input width to be a power of 2 for x-axis.
-constexpr unsigned int border_val = 64;
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- if(input->num_channels() == 1)
+ if (input->num_channels() == 1)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::S32, DataType::F16, DataType::F32);
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(axis == 0);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, "Not supported reduction operation for QASYMM8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8,
+ "Not supported reduction operation for QASYMM8");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
- ARM_COMPUTE_RETURN_ERROR_ON((op == ReductionOperation::MEAN_SUM) && (axis == 0) && (width == 0) && (input->data_type() != DataType::QASYMM8) && (input->data_type() != DataType::QASYMM8_SIGNED));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN), "Not supported reduction operation, use CLArgMinMaxLayer");
+ ARM_COMPUTE_RETURN_ERROR_ON((op == ReductionOperation::MEAN_SUM) && (axis == 0) && (input->dimension(0) == 0) &&
+ (input->data_type() != DataType::QASYMM8) &&
+ (input->data_type() != DataType::QASYMM8_SIGNED));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN),
+ "Not supported reduction operation, use CLArgMinMaxLayer");
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -70,85 +76,50 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
return Status{};
}
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis, ReductionOperation op)
-{
- // Output tensor auto initialization if not yet initialized
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, true);
- DataType output_data_type = input->data_type();
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
-
- const unsigned int num_elems_processed_per_iteration = (is_data_type_quantized(input->data_type()) && (axis == 0)) ? 1 : 16;
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- bool window_changed = false;
- const bool is_serial_op = needs_serialized_reduction(op, input->data_type(), axis);
-
- switch(axis)
- {
- case 0:
- {
- if(!is_serial_op)
- {
- const unsigned int border_width = ((input->dimension(0) % border_val) != 0) ? border_val - input->dimension(0) % border_val : 0;
- AccessWindowStatic input_access(input, 0, 0, input->dimension(0) + border_width, 1);
- AccessWindowHorizontal output_access(output, 0, 1);
- window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- }
- }
- break;
- case 1:
- case 2:
- case 3:
- {
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-
- return std::make_tuple(err, win);
-}
} // namespace
CLReductionOperationKernel::CLReductionOperationKernel()
- : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::SUM_SQUARE), _border_size()
-{
-}
-
-BorderSize CLReductionOperationKernel::border_size() const
+ : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::SUM_SQUARE)
{
- return _border_size;
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, unsigned int width)
+void CLReductionOperationKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op, width);
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op);
}
-void CLReductionOperationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, unsigned int width)
+void CLReductionOperationKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op, width));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
+
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
_reduction_axis = axis;
_op = op;
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, true);
+ auto_init_if_empty(*output->info(),
+ input->info()->clone()->set_tensor_shape(output_shape).reset_padding().set_is_resizable(true));
+
// Set build options
CLBuildOptions build_opts;
DataType data_type = input->info()->data_type();
std::string data_type_promoted{};
- if(is_data_type_quantized(data_type))
+ if (is_data_type_quantized(data_type))
{
data_type_promoted = "int";
}
@@ -157,8 +128,15 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
data_type_promoted = get_cl_type_from_data_type(data_type);
}
+ const unsigned int width = input->info()->dimension(0) * input->info()->num_channels();
+ unsigned int vec_size = (is_data_type_quantized(input->info()->data_type()) && (axis == 0)) ? 1 : 16;
+ vec_size = adjust_vec_size(vec_size, width);
+ const unsigned int vec_size_leftover = width % vec_size;
+
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
build_opts.add_option("-DDATA_TYPE_PROMOTED=" + data_type_promoted);
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
build_opts.add_option_if(is_data_type_float(data_type), "-DFLOAT_DATA_TYPE");
build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE");
build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DMEAN");
@@ -166,11 +144,14 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
build_opts.add_option_if(op == ReductionOperation::PROD, "-DPROD");
build_opts.add_option_if(op == ReductionOperation::MIN, "-DMIN");
build_opts.add_option_if(op == ReductionOperation::MAX, "-DMAX");
- build_opts.add_option_if(input->info()->num_channels() == 2, "-DCOMPLEX");
- build_opts.add_option_if(is_data_type_quantized(data_type), "-DOFFSET=" + support::cpp11::to_string(input->info()->quantization_info().uniform().offset));
- build_opts.add_option_if(is_data_type_quantized(data_type), "-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().uniform().scale));
-
- switch(op)
+ build_opts.add_option_if(is_data_type_quantized(data_type),
+ "-DOFFSET=" +
+ support::cpp11::to_string(input->info()->quantization_info().uniform().offset));
+ build_opts.add_option_if(
+ is_data_type_quantized(data_type),
+ "-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().uniform().scale));
+
+ switch (op)
{
case ReductionOperation::SUM_SQUARE:
build_opts.add_option(("-DOPERATION=square_sum"));
@@ -180,7 +161,10 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
build_opts.add_option(("-DOPERATION=sum"));
break;
case ReductionOperation::MIN:
+ build_opts.add_option(("-DOPERATION=min_"));
+ break;
case ReductionOperation::MAX:
+ build_opts.add_option(("-DOPERATION=max_"));
break;
case ReductionOperation::PROD:
build_opts.add_option(("-DOPERATION=product"));
@@ -190,30 +174,15 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
}
// Create kernel
- cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange();
std::string kernel_axis_name;
const bool is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis);
- switch(axis)
+ switch (axis)
{
case 0:
{
- if(is_serial_op)
- {
- build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
- build_opts.add_option_if_else(_input->info()->data_type() == DataType::F16, "-DCOND_DATA_TYPE=short", "-DCOND_DATA_TYPE=int");
- kernel_axis_name = "non_parallel_x";
- }
- else
- {
- build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DWIDTH=" + support::cpp11::to_string(width));
- const unsigned int width_leftover = input->info()->dimension(0) % border_val;
- const unsigned int border_width = (width_leftover != 0) ? border_val - width_leftover : 0;
- kernel_axis_name = "x";
-
- lws_hint = create_lws_hint_parallel_implementations(input->info()->dimension(0), border_val);
- _border_size = BorderSize(0, border_width, 0, 0);
- }
+ build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(width));
+ kernel_axis_name = ((is_serial_op) ? "non_parallel_x" : "x");
}
break;
case 1:
@@ -235,18 +204,21 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
_kernel = create_kernel(compile_context, "reduction_operation_" + kernel_axis_name, build_opts.options());
// Configure kernel window
- auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis, op);
+ TensorShape actual_input_shape = input->info()->tensor_shape();
+ actual_input_shape[0] = width;
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+ Window win = calculate_max_window(actual_input_shape, Steps(vec_size));
+ ICLKernel::configure_internal(win);
- ICLKernel::configure_internal(std::get<1>(win_config), lws_hint);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width)
+Status CLReductionOperationKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ unsigned int axis,
+ ReductionOperation op)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op, width));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis, op)));
-
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
return Status{};
}
@@ -256,18 +228,19 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
const bool is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis);
- switch(_reduction_axis)
+ switch (_reduction_axis)
{
case 0:
{
// We use parallel reduction only in non quantized types
- if(is_serial_op)
+ if (is_serial_op)
{
// Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+ Window window_in{window};
+ window_in.set(Window::DimX,
+ Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
- Window out_window{ window };
+ Window out_window{window};
out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
Window in_slice = window_in.first_slice_window_1D();
@@ -278,91 +251,114 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
unsigned int idx = 0;
add_1D_tensor_argument(idx, _input, in_slice);
add_1D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window_in.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
+ enqueue(queue, *this, in_slice);
+ } while (window_in.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
}
else
{
// Set out window
- Window out_window(window);
- out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
-
- // Get first input and output slices
- Window in_slice = window.first_slice_window_2D();
- Window out_slice = out_window.first_slice_window_2D();
-
- // Reshape window
- const unsigned int border_width = ((in_slice.x().end() % border_val) != 0) ? border_val - in_slice.x().end() % border_val : 0;
- in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step()));
+ bool has_collapsed = true;
+ Window window_in = window.collapse_if_possible(window, 2, &has_collapsed);
+ ARM_COMPUTE_ERROR_ON(!has_collapsed);
- // Set local sums buffer
- unsigned int local_res_size = lws_hint()[0] * _input->info()->element_size();
- _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_res_size, nullptr);
+ Window window_out = window_in;
+ window_out.set(0, Window::Dimension());
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, in_slice);
- add_2D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, window_in);
+ add_3D_tensor_argument(idx, _output, window_out);
+ enqueue(queue, *this, window_in);
}
}
break;
case 1:
{
- // Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
- Window in_slice = window_in.first_slice_window_2D();
- Window out_slice = window.first_slice_window_2D();
+ bool has_collapsed = true;
+ Window actual_window = window.collapse_if_possible(window, 2, &has_collapsed);
+ ARM_COMPUTE_ERROR_ON(!has_collapsed);
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, in_slice);
- add_2D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+ actual_window = actual_window.shift_dimensions(1, Window::DimY);
+
+ const ITensorInfo *input_info = _input->info();
+ const Strides &input_strides = input_info->strides_in_bytes();
+
+ const ITensorInfo *output_info = _output->info();
+ const Strides &output_strides = output_info->strides_in_bytes();
+
+ unsigned int idx = 0;
+
+ _kernel.setArg(idx++, _input->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, input_strides[1]);
+ _kernel.setArg<cl_uint>(idx++, input_strides[2]);
+ _kernel.setArg<cl_uint>(idx++, input_info->offset_first_element_in_bytes());
+
+ _kernel.setArg(idx++, _output->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, output_strides[2]);
+ _kernel.setArg<cl_uint>(idx++, output_info->offset_first_element_in_bytes());
+
+ enqueue(queue, *this, actual_window);
}
break;
case 2:
{
- // Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
- Window in_slice = window_in.first_slice_window_3D();
- Window out_slice = window.first_slice_window_3D();
+ bool has_collapsed = true;
+ Window actual_window = window.collapse_if_possible(window, 3, &has_collapsed);
+ ARM_COMPUTE_ERROR_ON(!has_collapsed);
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, in_slice);
- add_3D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
+ actual_window = actual_window.shift_dimensions(1, Window::DimZ);
+
+ const ITensorInfo *input_info = _input->info();
+ const Strides &input_strides = input_info->strides_in_bytes();
+
+ const ITensorInfo *output_info = _output->info();
+ const Strides &output_strides = output_info->strides_in_bytes();
+
+ unsigned int idx = 0;
+
+ _kernel.setArg(idx++, _input->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, input_strides[1]);
+ _kernel.setArg<cl_uint>(idx++, input_strides[2]);
+ _kernel.setArg<cl_uint>(idx++, input_strides[3]);
+ _kernel.setArg<cl_uint>(idx++, input_info->offset_first_element_in_bytes());
+
+ _kernel.setArg(idx++, _output->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, output_strides[1]);
+ _kernel.setArg<cl_uint>(idx++, output_strides[3]);
+ _kernel.setArg<cl_uint>(idx++, output_info->offset_first_element_in_bytes());
+
+ enqueue(queue, *this, actual_window);
}
break;
case 3:
{
- // Get first input and output slices
- Window window_in{ window };
- window_in.set(3, Window::Dimension(0, 1, 1));
- Window in_slice = window_in.first_slice_window_4D();
- Window out_slice = window.first_slice_window_4D();
+ bool has_collapsed = true;
+ Window actual_window = window.shift_dimensions(1, Window::DimW);
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, in_slice);
- add_4D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
+ actual_window = actual_window.collapse_if_possible(actual_window, 2, &has_collapsed);
+ ARM_COMPUTE_ERROR_ON(!has_collapsed);
+
+ const ITensorInfo *input_info = _input->info();
+ const Strides &input_strides = input_info->strides_in_bytes();
+
+ const ITensorInfo *output_info = _output->info();
+ const Strides &output_strides = output_info->strides_in_bytes();
+
+ unsigned int idx = 0;
+
+ _kernel.setArg(idx++, _input->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, input_strides[1]);
+ _kernel.setArg<cl_uint>(idx++, input_strides[2]);
+ _kernel.setArg<cl_uint>(idx++, input_strides[3]);
+ _kernel.setArg<cl_uint>(idx++, input_strides[4]);
+ _kernel.setArg<cl_uint>(idx++, input_info->offset_first_element_in_bytes());
+
+ _kernel.setArg(idx++, _output->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, output_strides[1]);
+ _kernel.setArg<cl_uint>(idx++, output_strides[2]);
+ _kernel.setArg<cl_uint>(idx++, output_strides[4]);
+ _kernel.setArg<cl_uint>(idx++, output_info->offset_first_element_in_bytes());
+
+ enqueue(queue, *this, actual_window);
}
break;
default:
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.h b/src/core/CL/kernels/CLReductionOperationKernel.h
new file mode 100644
index 0000000000..2f94b2add3
--- /dev/null
+++ b/src/core/CL/kernels/CLReductionOperationKernel.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H
+#define ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H
+
+#include "arm_compute/core/Types.h"
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the reduction operation kernel
+ */
+class CLReductionOperationKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLReductionOperationKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLReductionOperationKernel(const CLReductionOperationKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLReductionOperationKernel &operator=(const CLReductionOperationKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLReductionOperationKernel(CLReductionOperationKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLReductionOperationKernel &operator=(CLReductionOperationKernel &&) = default;
+ /** Default destructor */
+ ~CLReductionOperationKernel() = default;
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
+ * @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op);
+ /** Set the input and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
+ * @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperationKernel.
+ *
+ * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
+ * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
+ * @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
+ *
+ * @return a status
+ */
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ unsigned int _reduction_axis;
+ ReductionOperation _op;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H */
diff --git a/src/core/CL/kernels/CLRemapKernel.cpp b/src/core/CL/kernels/CLRemapKernel.cpp
deleted file mode 100644
index dcc425b1fc..0000000000
--- a/src/core/CL/kernels/CLRemapKernel.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLRemapKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <algorithm>
-
-using namespace arm_compute;
-
-CLRemapKernel::CLRemapKernel()
- : _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr)
-{
-}
-
-BorderSize CLRemapKernel::border_size() const
-{
- return BorderSize(1);
-}
-
-void CLRemapKernel::configure(const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, bool border_undefined)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, map_x, map_y, output, policy, border_undefined);
-}
-
-void CLRemapKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy,
- bool border_undefined)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported!");
- ARM_COMPUTE_UNUSED(border_undefined);
-
- _input = input;
- _output = output;
- _map_x = map_x;
- _map_y = map_y;
-
- // Create kernel
- std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) };
- std::string interpolation_name = string_from_interpolation_policy(policy);
- std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
- std::string kernel_name = "remap_" + interpolation_name;
- _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
- // Configure window
- constexpr unsigned int num_elems_processed_per_iteration = 4;
-
- const int total_right = ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration);
- const int access_right = total_right + (((total_right - input->info()->dimension(0)) == 0) ? border_size().right : 0);
-
- Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowStatic input_access(input->info(), -border_size().left, -border_size().top, access_right, input->info()->dimension(1) + border_size().bottom);
-
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-
- // Set static arguments
- unsigned int idx = 4 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
- _kernel.setArg<cl_float>(idx++, input->info()->dimension(0));
- _kernel.setArg<cl_float>(idx++, input->info()->dimension(1));
-}
-
-void CLRemapKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
- add_2D_tensor_argument(idx, _output, slice);
- add_2D_tensor_argument(idx, _map_x, slice);
- add_2D_tensor_argument(idx, _map_y, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLReorgLayerKernel.cpp b/src/core/CL/kernels/CLReorgLayerKernel.cpp
index 065e25ea41..9fd21943e8 100644
--- a/src/core/CL/kernels/CLReorgLayerKernel.cpp
+++ b/src/core/CL/kernels/CLReorgLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,17 +21,19 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLReorgLayerKernel.h"
+#include "src/core/CL/kernels/CLReorgLayerKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
#include <string>
@@ -50,13 +52,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
ARM_COMPUTE_RETURN_ERROR_ON(stride <= 0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, "The width of the input tensor must be a multiple of stride");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, "The height of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0,
+ "The width of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0,
+ "The height of the input tensor must be a multiple of stride");
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
+ const TensorInfo tensor_info_output =
+ output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -65,9 +70,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
}
} // namespace
-CLReorgLayerKernel::CLReorgLayerKernel()
- : _input(nullptr), _output(nullptr)
+CLReorgLayerKernel::CLReorgLayerKernel() : _input(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLReorgLayerKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t stride)
@@ -75,16 +80,22 @@ void CLReorgLayerKernel::configure(const ICLTensor *input, ICLTensor *output, in
configure(CLKernelLibrary::get().get_compile_context(), input, output, stride);
}
-void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t stride)
+void CLReorgLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ int32_t stride)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), stride));
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
- std::string kernel_name = std::string("reorg_layer_") + lower_string(string_from_data_layout(input->info()->data_layout()));
- const size_t idx_channel = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+ std::string kernel_name =
+ std::string("reorg_layer_") + lower_string(string_from_data_layout(input->info()->data_layout()));
+ const size_t idx_channel =
+ get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
// Create kernel
CLBuildOptions build_opts;
@@ -95,12 +106,13 @@ void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, cons
// Configure window
// auto inizialize the output tensor if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride)));
+ auto_init_if_empty(*output->info(),
+ input->info()->clone()->set_tensor_shape(
+ misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride)));
Window win = calculate_max_window(*output->info(), Steps());
// The CLWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
ICLKernel::configure_internal(win);
_config_id = kernel_name;
@@ -114,9 +126,12 @@ void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, cons
_config_id += support::cpp11::to_string(input->info()->dimension(2));
_config_id += "_";
_config_id += support::cpp11::to_string(stride);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLReorgLayerKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, int32_t stride)
+Status CLReorgLayerKernel::validate(const arm_compute::ITensorInfo *input,
+ const arm_compute::ITensorInfo *output,
+ int32_t stride)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, stride));
@@ -136,7 +151,6 @@ void CLReorgLayerKernel::run(const Window &window, cl::CommandQueue &queue)
add_3D_tensor_argument(idx, _input, slice);
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLReorgLayerKernel.h b/src/core/CL/kernels/CLReorgLayerKernel.h
new file mode 100644
index 0000000000..f335071e9f
--- /dev/null
+++ b/src/core/CL/kernels/CLReorgLayerKernel.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLREORGLAYERKERNEL_H
+#define ARM_COMPUTE_CLREORGLAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform a reorg layer */
+class CLReorgLayerKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLReorgLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLReorgLayerKernel(const CLReorgLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLReorgLayerKernel &operator=(const CLReorgLayerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLReorgLayerKernel(CLReorgLayerKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLReorgLayerKernel &operator=(CLReorgLayerKernel &&) = default;
+ /** Initialize the kernel's input, output.
+ *
+ * @param[in] input Source tensor. Data types supported: All.
+ * @param[out] output Destination tensor with tensor shape:
+ * [width_input / stride, height_input / stride, channels_input * stride * stride, batch_size]. This means the output has
+ * the same number of input elements. Data types supported: same as @p input.
+ * @param[in] stride Stride value to use for reorganizing the values in the output tensor.
+ * It defines the spatial distance between 2 consecutive pixels in the x and y direction
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, int32_t stride);
+ /** Initialize the kernel's input, output.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. Data types supported: All.
+ * @param[out] output Destination tensor with tensor shape:
+ * [width_input / stride, height_input / stride, channels_input * stride * stride, batch_size]. This means the output has
+ * the same number of input elements. Data types supported: same as @p input.
+ * @param[in] stride Stride value to use for reorganizing the values in the output tensor.
+ * It defines the spatial distance between 2 consecutive pixels in the x and y direction
+ */
+ void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t stride);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLReorgLayerKernel
+ *
+ * @param[in] input Source tensor info. Data types supported: All.
+ * @param[in] output Destination tensor info with tensor shape:
+ * [width_input / stride, height_input / stride, channels_input * stride * stride, batch_size]. This means the output has
+ * the same number of input elements. Data types supported: same as @p input.
+ * @param[in] stride Stride value to use for reorganizing the values in the output tensor.
+ * It defines the spatial distance between 2 consecutive pixels in the x and y direction
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t stride);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLREORGLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLReshapeLayerKernel.cpp b/src/core/CL/kernels/CLReshapeLayerKernel.cpp
deleted file mode 100644
index ce792489c5..0000000000
--- a/src/core/CL/kernels/CLReshapeLayerKernel.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-
-#include <string>
-
-/** [CLReshapeLayerKernel Kernel] **/
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() != output->tensor_shape().total_size());
-
- return Status{};
-}
-
-} // namespace
-
-CLReshapeLayerKernel::CLReshapeLayerKernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLReshapeLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLReshapeLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
- _input = input;
- _output = output;
-
- // Create kernel
- std::set<std::string> build_opts = { "-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()) };
- _kernel = create_kernel(compile_context, "reshape_layer", build_opts);
-
- // Add static arguments
- const cl_int2 input_shape =
- {
- {
- static_cast<cl_int>(_input->info()->tensor_shape()[0]),
- static_cast<cl_int>(_input->info()->tensor_shape()[1])
- }
- };
- const cl_int2 output_shape =
- {
- {
- static_cast<cl_int>(_output->info()->tensor_shape()[0]),
- static_cast<cl_int>(_output->info()->tensor_shape()[1])
- }
- };
- unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
- _kernel.setArg<cl_int2>(idx++, input_shape);
- _kernel.setArg<cl_int2>(idx++, output_shape);
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info());
-
- // Set the output valid region
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
- ICLKernel::configure_internal(win);
-}
-
-Status CLReshapeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
-
- return Status{};
-}
-
-void CLReshapeLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = window_collapsed.first_slice_window_3D();
-
- // Set inputs
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, window_collapsed);
- add_3D_tensor_argument(idx, _output, window_collapsed);
- enqueue(queue, *this, slice, lws_hint());
-}
-/** [CLReshapeLayerKernel Kernel] **/
diff --git a/src/core/CL/kernels/CLReverseKernel.cpp b/src/core/CL/kernels/CLReverseKernel.cpp
index d2a3809359..00241b161b 100644
--- a/src/core/CL/kernels/CLReverseKernel.cpp
+++ b/src/core/CL/kernels/CLReverseKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,33 +21,40 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLReverseKernel.h"
+#include "src/core/CL/kernels/CLReverseKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Window.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis)
{
+ ARM_COMPUTE_UNUSED(use_inverted_axis);
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, axis);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4,
+ "Current implementation only supports up to 4 dimensions.");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->dimension(0) > 4, "Only up to 4 dimensions can be reversed");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -58,19 +65,27 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-CLReverseKernel::CLReverseKernel()
- : _input(nullptr), _output(nullptr), _axis(nullptr)
+CLReverseKernel::CLReverseKernel() : _input(nullptr), _output(nullptr), _axis(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+void CLReverseKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *axis,
+ bool use_inverted_axis)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, axis);
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, use_inverted_axis);
}
-void CLReverseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+void CLReverseKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *axis,
+ bool use_inverted_axis)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, axis);
+ auto padding_info = get_padding_info({input, output, axis});
_input = input;
_output = output;
@@ -79,12 +94,14 @@ void CLReverseKernel::configure(const CLCompileContext &compile_context, const I
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output->info(), *input->info()->clone());
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis->info(), use_inverted_axis));
// Set kernel build options
CLBuildOptions build_opts;
build_opts.add_option("-DNUM_REVERSE_DIMS=" + support::cpp11::to_string(axis->info()->dimension(0)));
build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
+ build_opts.add_option("-DRANK=" + support::cpp11::to_string(input->info()->num_dimensions()));
+ build_opts.add_option_if(use_inverted_axis, "-DUSE_INVERTED_AXIS");
// Create kernel
_kernel = create_kernel(compile_context, "reverse", build_opts.options());
@@ -109,11 +126,15 @@ void CLReverseKernel::configure(const CLCompileContext &compile_context, const I
_config_id += support::cpp11::to_string(input->info()->dimension(1));
_config_id += "_";
_config_id += support::cpp11::to_string(input->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+Status CLReverseKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *axis,
+ bool use_inverted_axis)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, use_inverted_axis));
return Status{};
}
@@ -133,7 +154,6 @@ void CLReverseKernel::run(const Window &window, cl::CommandQueue &queue)
add_1D_tensor_argument(idx, _axis, axis_slice);
add_4D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_4D(slice));
+ } while (collapsed.slide_window_slice_4D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLReverseKernel.h b/src/core/CL/kernels/CLReverseKernel.h
new file mode 100644
index 0000000000..a630aec15a
--- /dev/null
+++ b/src/core/CL/kernels/CLReverseKernel.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CORE_CL_KERNELS_CLREVERSEKERNEL_H
+#define ACL_SRC_CORE_CL_KERNELS_CLREVERSEKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the reverse kernel */
+class CLReverseKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLReverseKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLReverseKernel(const CLReverseKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLReverseKernel &operator=(const CLReverseKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLReverseKernel(CLReverseKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLReverseKernel &operator=(CLReverseKernel &&) = default;
+ /** Default destructor */
+ ~CLReverseKernel() = default;
+ /** Initialise the kernel's inputis and output
+ *
+ * @param[in] input Input tensor. Data types supported: All.
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+ * @param[in] use_inverted_axis Reverse ACL axis indices convention i.e. acl.dim(0) = tensor_rank -1
+ *
+ * @note The value of each axis should be between [-rank, rank)
+ * @note If there are duplicate values in the tensor, the subsequent axis values are ignored. e.g. an array of [2, 2] has the same effects as [2].
+ *
+ * @deprecated Support for U32 in axis tensor will be removed in 24.02 release
+ *
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis, bool use_inverted_axis);
+ /** Initialise the kernel's inputis and output
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Input tensor. Data types supported: All.
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+ * @param[in] use_inverted_axis Reverse ACL axis indices convention i.e. acl.dim(0) = tensor_rank -1
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *axis,
+ bool use_inverted_axis);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref CLReverseKernel
+ *
+ * @param[in] input Input tensor info. Data types supported: All.
+ * @param[in] output Output tensor info. Data type supported: Same as @p input
+ * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+ * @param[in] use_inverted_axis Reverse ACL axis indices convention i.e. acl.dim(0) = tensor_rank -1
+ *
+ * @return a status
+ */
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+public:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ const ICLTensor *_axis;
+};
+} // namespace arm_compute
+#endif // ACL_SRC_CORE_CL_KERNELS_CLREVERSEKERNEL_H
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
deleted file mode 100644
index f41664f4e0..0000000000
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ /dev/null
@@ -1,307 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-namespace
-{
-inline std::pair<float, float> calculate_scale_factors(const ITensorInfo &input, const ITensorInfo &output, bool align_corners)
-{
- DataLayout data_layout = input.data_layout();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
- // Compute the ratio between source width/height and destination width/height
- const unsigned int input_width = input.dimension(idx_width);
- const unsigned int input_height = input.dimension(idx_height);
- const unsigned int output_width = output.dimension(idx_width);
- const unsigned int output_height = output.dimension(idx_height);
-
- float wr = arm_compute::calculate_resize_ratio(input_width, output_width, align_corners);
- float hr = arm_compute::calculate_resize_ratio(input_height, output_height, align_corners);
-
- return std::make_pair(wr, hr);
-}
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, bool align_corners)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(output == input);
-
- if(align_corners)
- {
- // For bilinear method with aligned corners, the resize ratio will
- // be calculated by (input_size - 1)/(output_size - 1). Belows are
- // checking possible overflows.
- const auto data_layout = input->data_layout();
- const auto width_index = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const auto height_index = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
- const auto input_width = input->dimension(width_index);
- const auto input_height = input->dimension(height_index);
- const auto output_width = output->dimension(width_index);
- const auto output_height = output->dimension(height_index);
-
- ARM_COMPUTE_RETURN_ERROR_ON(input_width == 0 || input_height == 0 || output_width == 0 || output_height == 0);
- ARM_COMPUTE_RETURN_ERROR_ON((output_width - 1 == 0) || (output_height - 1 == 0));
- }
-
- float wr = 0.f;
- float hr = 0.f;
- std::tie(wr, hr) = calculate_scale_factors(*input, *output, align_corners);
-
- ARM_COMPUTE_RETURN_ERROR_ON(policy == InterpolationPolicy::AREA && (wr > 1.f || hr > 1.f));
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy, BorderSize &border)
-{
- Window win{};
- bool window_changed{};
- unsigned int num_elems_processed_per_iteration = 0;
- DataLayout data_layout = input->data_layout();
-
- switch(data_layout)
- {
- case DataLayout::NCHW:
- {
- if(border_mode == BorderMode::UNDEFINED)
- {
- border = BorderSize(0);
- }
-
- num_elems_processed_per_iteration = 4;
- // Configure kernel window
- win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- AccessWindowStatic input_access(input,
- -border.left, -border.top,
- input->dimension(0) + border.right,
- input->dimension(1) + border.bottom);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
- output_access.set_valid_region(win, calculate_valid_region_scale(*(input),
- output->tensor_shape(),
- policy,
- sampling_policy,
- border_mode == BorderMode::UNDEFINED));
-
- window_changed = update_window_and_padding(win, input_access, output_access);
- }
- break;
- case DataLayout::NHWC:
- {
- num_elems_processed_per_iteration = 1;
- // Configure kernel window
- win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- AccessWindowStatic input_access(input, -border.left, -border.top,
- input->dimension(0) + border.right,
- input->dimension(1) + border.bottom);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Data layout not supported");
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-BorderSize CLScaleKernel::border_size() const
-{
- return BorderSize(1);
-}
-
-Status CLScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy,
- BorderMode border_mode, SamplingPolicy sampling_policy, bool align_corners)
-{
- BorderSize border = BorderSize(1);
- const bool is_align_corners = policy == InterpolationPolicy::BILINEAR
- && sampling_policy == SamplingPolicy::TOP_LEFT
- && align_corners;
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, policy, is_align_corners));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), policy, border_mode, sampling_policy, border).first);
-
- return Status{};
-}
-
-const ICLTensor *CLScaleKernel::input() const
-{
- return _input;
-}
-
-const ICLTensor *CLScaleKernel::output() const
-{
- return _output;
-}
-
-void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy, bool align_corners)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, border_mode, sampling_policy, align_corners);
-}
-
-void CLScaleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy,
- bool align_corners)
-{
- _align_corners = policy == InterpolationPolicy::BILINEAR
- && sampling_policy == SamplingPolicy::TOP_LEFT
- && align_corners;
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy, _align_corners));
-
- _input = input;
- _output = output;
- _interpolationPolicy = policy;
- _data_layout = input->info()->data_layout();
-
- float wr = 0.f;
- float hr = 0.f;
- std::tie(wr, hr) = calculate_scale_factors(*input->info(), *output->info(), _align_corners);
-
- const bool call_quantized_kernel = is_data_type_quantized_asymmetric(input->info()->data_type()) && policy == InterpolationPolicy::BILINEAR;
-
- const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- const bool is_nhwc = _data_layout == DataLayout::NHWC;
-
- // Compute actual border size
- BorderSize border = border_size();
-
- // Area interpolation behaves as Nearest Neighbour in case of up-sampling
- if(policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
- {
- policy = InterpolationPolicy::NEAREST_NEIGHBOR;
- }
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), policy, border_mode, sampling_policy, border);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DBORDER_SIZE=" + support::cpp11::to_string(border.right));
- build_opts.add_option_if(border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
- build_opts.add_option_if(is_nhwc, "-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.add_option_if_else(sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
- if(call_quantized_kernel)
- {
- const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
- build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale));
- build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset));
- }
-
- std::string interpolation_name = string_from_interpolation_policy(policy);
- std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
- std::string kernel_name = "scale_" + interpolation_name;
- kernel_name += call_quantized_kernel ? "_quantized_" : "_";
- kernel_name += lower_string(string_from_data_layout(_data_layout));
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- unsigned int idx = is_nhwc ? 2 * num_arguments_per_4D_tensor() : 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-
- const unsigned int input_width = input->info()->dimension(idx_width);
- const unsigned int input_height = input->info()->dimension(idx_height);
-
- _kernel.setArg<float>(idx++, input_width);
- _kernel.setArg<float>(idx++, input_height);
- _kernel.setArg<float>(idx++, wr);
- _kernel.setArg<float>(idx++, hr);
-
- // Set config_id for enabling LWS tuning
- _config_id = "scale_";
- _config_id += (border_mode == BorderMode::REPLICATE ? "Bord_rep" : "");
- _config_id += (sampling_policy == SamplingPolicy::CENTER ? "center" : "topleft");
- _config_id += (is_nhwc ? "nhwc" : "nchw");
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(3));
-}
-
-void CLScaleKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- switch(_data_layout)
- {
- case DataLayout::NCHW:
- {
- Window slice = window.first_slice_window_2D();
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
- add_2D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
- break;
- }
- case DataLayout::NHWC:
- {
- Window collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_4D();
-
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice);
- add_4D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Data layout not supported");
- }
-}
diff --git a/src/core/CL/kernels/CLScharr3x3Kernel.cpp b/src/core/CL/kernels/CLScharr3x3Kernel.cpp
deleted file mode 100644
index cb657446f2..0000000000
--- a/src/core/CL/kernels/CLScharr3x3Kernel.cpp
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLScharr3x3Kernel::CLScharr3x3Kernel()
- : _run_scharr_x(false), _run_scharr_y(false), _input(nullptr), _output_x(nullptr), _output_y(nullptr)
-{
-}
-
-BorderSize CLScharr3x3Kernel::border_size() const
-{
- return BorderSize(1);
-}
-
-void CLScharr3x3Kernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_undefined);
-}
-
-void CLScharr3x3Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
- _run_scharr_x = output_x != nullptr;
- _run_scharr_y = output_y != nullptr;
-
- if(_run_scharr_x)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
- }
-
- if(_run_scharr_y)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
- }
-
- _input = input;
- _output_x = output_x;
- _output_y = output_y;
-
- // Set build options
- std::set<std::string> build_opts;
-
- if(_run_scharr_x)
- {
- build_opts.insert("-DGRAD_X");
- }
-
- if(_run_scharr_y)
- {
- build_opts.insert("-DGRAD_Y");
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, "scharr3x3", build_opts);
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 8;
- constexpr unsigned int num_elems_read_per_iteration = 16;
- constexpr unsigned int num_elems_written_per_iteration = 8;
- constexpr unsigned int num_rows_read_per_iteration = 3;
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
- AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
- AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
- AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
-
- update_window_and_padding(win, input_access, output_x_access, output_y_access);
-
- output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
- output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
- ICLKernel::configure_internal(win);
-}
-
-void CLScharr3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
- add_2D_tensor_argument_if((_run_scharr_x), idx, _output_x, slice);
- add_2D_tensor_argument_if((_run_scharr_y), idx, _output_y, slice);
-
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLSelectKernel.cpp b/src/core/CL/kernels/CLSelectKernel.cpp
index 2789764d10..703c64d8d3 100644
--- a/src/core/CL/kernels/CLSelectKernel.cpp
+++ b/src/core/CL/kernels/CLSelectKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,17 +21,19 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLSelectKernel.h"
+#include "src/core/CL/kernels/CLSelectKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
@@ -40,7 +42,7 @@ namespace
{
Status validate_arguments(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(c, x, y);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(c, x, y, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(x);
ARM_COMPUTE_RETURN_ERROR_ON(x->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, y);
@@ -49,9 +51,11 @@ Status validate_arguments(const ITensorInfo *c, const ITensorInfo *x, const ITen
const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(is_same_rank && (x->tensor_shape() != c->tensor_shape()));
- ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && ((c->tensor_shape().num_dimensions() > 1) || (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
+ ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank &&
+ ((c->tensor_shape().num_dimensions() > 1) ||
+ (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
- if(output != nullptr && output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, output);
@@ -59,55 +63,18 @@ Status validate_arguments(const ITensorInfo *c, const ITensorInfo *x, const ITen
return Status{};
}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *c, ITensorInfo *x, ITensorInfo *y, ITensorInfo *output)
-{
- if(output != nullptr)
- {
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, *x->clone());
- }
-
- const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions());
-
- const unsigned int num_elems_processed_per_iteration = 16 / x->element_size();
-
- // Configure kernel window
- Window win = calculate_max_window(*x, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal x_access(x, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal y_access(y, 0, num_elems_processed_per_iteration);
- bool window_changed = update_window_and_padding(win, x_access, y_access);
-
- // Update window for condition
- if(is_same_rank)
- {
- AccessWindowHorizontal c_access(c, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, c_access);
- }
-
- // Update window for output
- if(output != nullptr)
- {
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, output_access);
- output_access.set_valid_region(win, x->valid_region());
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
} // namespace
-CLSelectKernel::CLSelectKernel()
- : _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
+CLSelectKernel::CLSelectKernel() : _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
{
-}
-void CLSelectKernel::configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), c, x, y, output);
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLSelectKernel::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output)
+void CLSelectKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *c,
+ const ICLTensor *x,
+ const ICLTensor *y,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(c, x, y, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(c->info(), x->info(), y->info(), output->info()));
@@ -118,24 +85,26 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC
_output = output;
_has_same_rank = (c->info()->tensor_shape().num_dimensions() == x->info()->tensor_shape().num_dimensions());
- const unsigned int num_elems_processed_per_iteration = 16 / x->info()->element_size();
+ auto padding_info = get_padding_info({c, x, y, output});
+ const unsigned int vec_size_x = adjust_vec_size(16 / x->info()->element_size(), x->info()->dimension(0));
+ const int vec_size_x_leftovers = output->info()->dimension(0) % vec_size_x;
// Set build options
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(x->info()->data_type()));
- build_opts.add_option("-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(x->info()->data_type()));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(x->info()->element_size()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftovers));
// Create kernel
std::string kernel_name = "select";
- if(_has_same_rank)
+ if (_has_same_rank)
{
kernel_name += "_same_rank";
}
else
{
const bool is_input_rank_greater_than_two = x->info()->tensor_shape().num_dimensions() > 2;
- if(is_input_rank_greater_than_two)
+ if (is_input_rank_greater_than_two)
{
const size_t width = x->info()->tensor_shape().x();
const size_t height = x->info()->tensor_shape().y();
@@ -149,9 +118,9 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Configure kernel window
- auto win_config = validate_and_configure_window(c->info(), x->info(), y->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
+ auto_init_if_empty(*output->info(), *x->info()->clone());
+ Window win = calculate_max_window(*x->info(), Steps(vec_size_x));
+ ICLKernel::configure_internal(win);
_config_id = "select_";
_config_id += string_from_data_type(x->info()->data_type());
@@ -161,12 +130,13 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC
_config_id += support::cpp11::to_string(x->info()->dimension(1));
_config_id += "_";
_config_id += support::cpp11::to_string(x->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLSelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+Status
+CLSelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(c, x, y, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(c->clone().get(), x->clone().get(), y->clone().get(), output->clone().get()).first);
return Status{};
}
@@ -178,7 +148,7 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu
Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
Window slice = collapsed.first_slice_window_3D();
- if(!_has_same_rank)
+ if (!_has_same_rank)
{
Window vector_slice = window.first_slice_window_1D();
vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
@@ -189,7 +159,7 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu
do
{
unsigned int idx = _has_same_rank ? 0 : num_arguments_per_1D_tensor();
- if(_has_same_rank)
+ if (_has_same_rank)
{
add_3D_tensor_argument(idx, _c, slice);
}
@@ -198,7 +168,6 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLSelectKernel.h b/src/core/CL/kernels/CLSelectKernel.h
new file mode 100644
index 0000000000..c4256fd743
--- /dev/null
+++ b/src/core/CL/kernels/CLSelectKernel.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLSELECTKERNEL_H
+#define ARM_COMPUTE_CLSELECTKERNEL_H
+
+#include "arm_compute/core/Types.h"
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ICLTensor;
+
+/** OpenCL interface for executing the select kernel
+ *
+ * Select is computed by:
+ * @f[ output(i) = condition(i) ? x(i) : y(i) @f]
+ **/
+class CLSelectKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLSelectKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLSelectKernel(const CLSelectKernel &) = delete;
+ /** Prevent instances of this class from being copy-assigned (As this class contains pointers) */
+ CLSelectKernel &operator=(const CLSelectKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLSelectKernel(CLSelectKernel &&) = default;
+ /** Allow instances of this class to be move-assigned */
+ CLSelectKernel &operator=(CLSelectKernel &&) = default;
+ /** Default destructor */
+ ~CLSelectKernel() = default;
+ /** Initialise the kernel's inputs and output.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] c Condition input tensor. Data types supported: U8.
+ * @param[in] x First input tensor. Data types supported: All.
+ * @param[in] y Second input tensor. Data types supported: Same as @p x.
+ * @param[out] output Output tensor. Data types supported: Same as @p x.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *c,
+ const ICLTensor *x,
+ const ICLTensor *y,
+ ICLTensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLSelectKernel
+ *
+ * @param[in] c Condition input tensor info. Data types supported: U8.
+ * @param[in] x First input tensor info. Data types supported: All.
+ * @param[in] y Second input tensor info. Data types supported: Same as @p x.
+ * @param[in] output Output tensor info. Data types supported: Same as @p x.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_c; /**< Condition tensor */
+ const ICLTensor *_x; /**< Source tensor 1 */
+ const ICLTensor *_y; /**< Source tensor 2 */
+ ICLTensor *_output; /**< Destination tensor */
+ bool _has_same_rank; /**< Flag that indicates if condition tensor and other inputs have the same rank */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLSELECTKERNEL_H */
diff --git a/src/core/CL/kernels/CLSobel3x3Kernel.cpp b/src/core/CL/kernels/CLSobel3x3Kernel.cpp
deleted file mode 100644
index 12d04d99fe..0000000000
--- a/src/core/CL/kernels/CLSobel3x3Kernel.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLSobel3x3Kernel::CLSobel3x3Kernel()
- : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
-{
-}
-
-BorderSize CLSobel3x3Kernel::border_size() const
-{
- return BorderSize(1);
-}
-
-void CLSobel3x3Kernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_undefined);
-}
-
-void CLSobel3x3Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
- _run_sobel_x = output_x != nullptr;
- _run_sobel_y = output_y != nullptr;
-
- if(_run_sobel_x)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
- }
-
- if(_run_sobel_y)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
- }
-
- _input = input;
- _output_x = output_x;
- _output_y = output_y;
-
- // Set build options
- std::set<std::string> build_opts;
-
- if(_run_sobel_x)
- {
- build_opts.insert("-DGRAD_X");
- }
-
- if(_run_sobel_y)
- {
- build_opts.insert("-DGRAD_Y");
- }
-
- // Create kernel
- const std::string kernel_name = std::string("sobel3x3");
- _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 8;
- constexpr unsigned int num_elems_read_per_iteration = 16;
- constexpr unsigned int num_elems_written_per_iteration = 8;
- constexpr unsigned int num_rows_read_per_iteration = 3;
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
- AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
- AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
- AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
-
- update_window_and_padding(win, input_access, output_x_access, output_y_access);
-
- output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
- output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(border_undefined);
-}
-
-void CLSobel3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
- add_2D_tensor_argument_if((_run_sobel_x), idx, _output_x, slice);
- add_2D_tensor_argument_if((_run_sobel_y), idx, _output_y, slice);
-
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLSobel5x5Kernel.cpp b/src/core/CL/kernels/CLSobel5x5Kernel.cpp
deleted file mode 100644
index a60bb0b838..0000000000
--- a/src/core/CL/kernels/CLSobel5x5Kernel.cpp
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLSobel5x5HorKernel::CLSobel5x5HorKernel()
- : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
-{
-}
-
-BorderSize CLSobel5x5HorKernel::border_size() const
-{
- return _border_size;
-}
-
-void CLSobel5x5HorKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_undefined);
-}
-
-void CLSobel5x5HorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
- _run_sobel_x = output_x != nullptr;
- _run_sobel_y = output_y != nullptr;
-
- if(_run_sobel_x)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
- }
-
- if(_run_sobel_y)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
- }
-
- _input = input;
- _output_x = output_x;
- _output_y = output_y;
- _border_size = BorderSize(border_undefined ? 0 : 2, 2);
-
- // Set build options
- std::set<std::string> build_opts;
-
- if(_run_sobel_x)
- {
- build_opts.insert("-DGRAD_X");
- }
-
- if(_run_sobel_y)
- {
- build_opts.insert("-DGRAD_Y");
- }
-
- // Create kernel
- const std::string kernel_name = std::string("sobel_separable1x5");
- _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 8;
- constexpr unsigned int num_elems_read_per_iteration = 16;
- constexpr unsigned int num_elems_written_per_iteration = 8;
-
- Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
- AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration);
- AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
- AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
-
- update_window_and_padding(win, input_access, output_x_access, output_y_access);
-
- output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
- output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(border_undefined);
-}
-
-void CLSobel5x5HorKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
- add_2D_tensor_argument_if((_run_sobel_x), idx, _output_x, slice);
- add_2D_tensor_argument_if((_run_sobel_y), idx, _output_y, slice);
-
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
-
-CLSobel5x5VertKernel::CLSobel5x5VertKernel()
- : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
-{
-}
-
-BorderSize CLSobel5x5VertKernel::border_size() const
-{
- return BorderSize{ 2, 0 };
-}
-
-void CLSobel5x5VertKernel::configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input_x, input_y, output_x, output_y, border_undefined);
-}
-
-void CLSobel5x5VertKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
- ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
- _run_sobel_x = output_x != nullptr;
- _run_sobel_y = output_y != nullptr;
-
- if(_run_sobel_x)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_x, 1, DataType::S16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
- }
-
- if(_run_sobel_y)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_y, 1, DataType::S16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
- }
-
- _input_x = input_x;
- _input_y = input_y;
- _output_x = output_x;
- _output_y = output_y;
-
- // Set build options
- std::set<std::string> build_opts;
-
- if(_run_sobel_x)
- {
- build_opts.insert("-DGRAD_X");
- }
-
- if(_run_sobel_y)
- {
- build_opts.insert("-DGRAD_Y");
- }
-
- // Create kernel
- const std::string kernel_name = std::string("sobel_separable5x1");
- _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
- const ICLTensor *input = _run_sobel_x ? _input_x : _input_y;
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 8;
- constexpr unsigned int num_elems_written_per_iteration = 8;
- constexpr unsigned int num_elems_read_per_iteration = 8;
- constexpr unsigned int num_rows_read_per_iteration = 5;
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
- AccessWindowRectangle input_x_access(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
- AccessWindowRectangle input_y_access(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
- AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
- AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
-
- update_window_and_padding(win, input_x_access, input_y_access, output_x_access, output_y_access);
-
- output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
- output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(border_undefined);
-}
-
-void CLSobel5x5VertKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument_if((_run_sobel_x), idx, _input_x, slice);
- add_2D_tensor_argument_if((_run_sobel_x), idx, _output_x, slice);
- add_2D_tensor_argument_if((_run_sobel_y), idx, _input_y, slice);
- add_2D_tensor_argument_if((_run_sobel_y), idx, _output_y, slice);
-
- _kernel.setArg(idx++, 0 /*dummy*/);
-
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLSobel7x7Kernel.cpp b/src/core/CL/kernels/CLSobel7x7Kernel.cpp
deleted file mode 100644
index a5fbe54678..0000000000
--- a/src/core/CL/kernels/CLSobel7x7Kernel.cpp
+++ /dev/null
@@ -1,257 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-CLSobel7x7HorKernel::CLSobel7x7HorKernel()
- : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
-{
-}
-
-BorderSize CLSobel7x7HorKernel::border_size() const
-{
- return _border_size;
-}
-
-void CLSobel7x7HorKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_undefined);
-}
-
-void CLSobel7x7HorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
- _run_sobel_x = output_x != nullptr;
- _run_sobel_y = output_y != nullptr;
-
- if(_run_sobel_x)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S32);
- }
-
- if(_run_sobel_y)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S32);
- }
-
- _input = input;
- _output_x = output_x;
- _output_y = output_y;
- _border_size = BorderSize(border_undefined ? 0 : 3, 3);
-
- // Construct kernel name
- const std::string kernel_name = "sobel_separable1x7";
-
- // Set build options
- std::set<std::string> build_opts;
-
- if(_run_sobel_x)
- {
- build_opts.insert("-DGRAD_X");
- }
-
- if(_run_sobel_y)
- {
- build_opts.insert("-DGRAD_Y");
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 8;
- constexpr unsigned int num_elems_read_per_iteration = 16;
- constexpr unsigned int num_elems_written_per_iteration = 8;
-
- Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
- AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration);
- AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
- AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
-
- update_window_and_padding(win, input_access, output_x_access, output_y_access);
-
- output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
- output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(border_undefined);
-}
-
-void CLSobel7x7HorKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
- add_2D_tensor_argument_if((_run_sobel_x), idx, _output_x, slice);
- add_2D_tensor_argument_if((_run_sobel_y), idx, _output_y, slice);
-
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
-
-CLSobel7x7VertKernel::CLSobel7x7VertKernel()
- : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
-{
-}
-
-BorderSize CLSobel7x7VertKernel::border_size() const
-{
- return BorderSize{ 3, 0 };
-}
-
-void CLSobel7x7VertKernel::configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input_x, input_y, output_x, output_y, border_undefined);
-}
-
-void CLSobel7x7VertKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
-{
- ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
- _run_sobel_x = output_x != nullptr;
- _run_sobel_y = output_y != nullptr;
-
- if(_run_sobel_x)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_x, 1, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S32);
- }
-
- if(_run_sobel_y)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_y, 1, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S32);
- }
-
- _input_x = input_x;
- _input_y = input_y;
- _output_x = output_x;
- _output_y = output_y;
-
- // Set build options
- std::set<std::string> build_opts;
-
- if(_run_sobel_x)
- {
- build_opts.insert("-DGRAD_X");
- }
-
- if(_run_sobel_y)
- {
- build_opts.insert("-DGRAD_Y");
- }
-
- // Create kernel
- const std::string kernel_name = std::string("sobel_separable7x1");
- _kernel = create_kernel(compile_context, kernel_name, build_opts);
-
- const ICLTensor *input = _run_sobel_x ? _input_x : _input_y;
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 8;
- constexpr unsigned int num_elems_written_per_iteration = 8;
- constexpr unsigned int num_elems_read_per_iteration = 8;
- constexpr unsigned int num_rows_read_per_iteration = 7;
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
- AccessWindowRectangle input_x_access(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
- AccessWindowRectangle input_y_access(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
- AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
- AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
-
- update_window_and_padding(win, input_x_access, input_y_access, output_x_access, output_y_access);
-
- output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
- output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(border_undefined);
-}
-
-void CLSobel7x7VertKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_2D();
-
- do
- {
- unsigned int idx = 0;
-
- add_2D_tensor_argument_if((_run_sobel_x), idx, _input_x, slice);
- add_2D_tensor_argument_if((_run_sobel_x), idx, _output_x, slice);
- add_2D_tensor_argument_if((_run_sobel_y), idx, _input_y, slice);
- add_2D_tensor_argument_if((_run_sobel_y), idx, _output_y, slice);
-
- _kernel.setArg(idx++, 0 /*dummy*/);
-
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
deleted file mode 100644
index 09deb94a85..0000000000
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ /dev/null
@@ -1,426 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-namespace
-{
-/** Calculates softmax parameters from the quantized input scale and scaling factor for the exponent and places them as build options.
- *
- * Prepares these build options:
- * -INPUT_BETA_MULTIPLIER, INPUT_BETA_LEFT_SHIFT - quantized representation of beta multiplier.
- * -DIFF_MIN - threshold difference between maximum value of input data and current processed value,
- * it defines whether the value will be taken into account or not.
- *
- * @param[in] build_opts Build options to extend
- * @param[in] input_scale Input scaling factor
- * @param[in] beta Exponent scaling factor beta
- */
-CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float beta)
-{
- // Number of integer bits in temporary fixed-point representation of current-to-max difference
- static const int scaled_diff_int_bits = 5;
- // Number of integer bits used in temporary fixed-point representation of exponent accumulator
- static const int exp_accumulation_in_bits = 12;
-
- const double beta_multiplier = std::min(
- 1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)),
- (1LL << 31) - 1.0);
- int input_beta_multiplier;
- int input_beta_left_shift;
- quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier, &input_beta_left_shift);
-
- const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1LL << (31 - scaled_diff_int_bits)) / (1LL << input_beta_left_shift);
- const int diff_min = -1.f * std::floor(max_input_rescaled);
-
- CLBuildOptions build_opts;
- build_opts.add_option("-DSCALED_DIFF_INT_BITS=" + support::cpp11::to_string(scaled_diff_int_bits));
- build_opts.add_option("-DEXP_ACCUMULATION_INT_BITS=" + support::cpp11::to_string(exp_accumulation_in_bits));
- build_opts.add_option("-DINPUT_BETA_MULTIPLIER=" + support::cpp11::to_string(input_beta_multiplier));
- build_opts.add_option("-DINPUT_BETA_LEFT_SHIFT=" + support::cpp11::to_string(input_beta_left_shift));
- build_opts.add_option("-DDIFF_MIN=" + support::cpp11::to_string(diff_min));
-
- return build_opts;
-}
-
-Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(max, sum, output);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, max);
-
- const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->data_type());
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- if(is_quantized_asymmetric)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
-
- // Checks performed when sum is configured
- if(sum->total_size() != 0)
- {
- if(is_quantized_asymmetric)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(sum, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(max, sum);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(max, sum);
- }
-
- return Status{};
-}
-
-Status validate_arguments_1DNorm(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, const SoftmaxKernelInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(sum, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
-
- // Note: output should always have a scale of 1/256 and offset 0
- const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log);
- const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type);
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- if(!is_quantized_asymmetric)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON(output->quantization_info() != allowed_quantization_info);
- }
- }
-
- return Status{};
-}
-
-// Window validation
-
-std::pair<Status, Window> validate_and_configure_window_1DMaxShiftExpSum(ITensorInfo *input, ITensorInfo *max, ITensorInfo *output, ITensorInfo *sum)
-{
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*sum, input->clone()->set_tensor_shape(max->tensor_shape()));
- auto_init_if_empty(*output, *input->clone());
-
- CLLogits1DMaxShiftExpSumKernel::ParallelReductionInfo parallel_reduction_info = CLLogits1DMaxShiftExpSumKernel::is_parallel_reduction(input->dimension(0));
- unsigned int vector_size = std::get<1>(parallel_reduction_info);
- const unsigned int num_elems_x = ceil_to_multiple(input->tensor_shape().x(), vector_size);
- Window win = calculate_max_window(*input, Steps(num_elems_x));
-
- AccessWindowHorizontal input_access(input, 0, num_elems_x);
- AccessWindowHorizontal max_access(max, 0, 1);
- AccessWindowHorizontal output_access(output, 0, num_elems_x);
- AccessWindowHorizontal sum_access(sum, 0, 1);
-
- bool window_changed = update_window_and_padding(win, input_access, max_access, output_access, sum_access);
-
- output_access.set_valid_region(win, input->valid_region());
- sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->tensor_shape()));
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
-std::pair<Status, Window> validate_and_configure_window_1DNorm(ITensorInfo *input, ITensorInfo *output, ITensorInfo *sum, const SoftmaxKernelInfo &info)
-{
- const DataType output_data_type = info.input_data_type;
- const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output,
- input->clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info));
-
- constexpr unsigned int num_elems_processed_per_iteration = 16;
-
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowStatic sum_access(sum, 0, 0, 1, sum->dimension(1));
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win, input_access, sum_access, output_access);
-
- output_access.set_valid_region(win, input->valid_region());
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
-} // namespace
-
-/**< Grid size (obtained through auto-tuning) */
-const unsigned int CLLogits1DMaxShiftExpSumKernel::_grid_size = 64;
-/**< Vector size in the serial case (obtained through auto-tuning) */
-const unsigned int CLLogits1DMaxShiftExpSumKernel::_serial_vector_size = 8;
-/**< Vector size in the parallel case (obtained through auto-tuning, enables the best memory access pattern for Bifrost) .*/
-const unsigned int CLLogits1DMaxShiftExpSumKernel::_parallel_vector_size = 4;
-
-CLLogits1DMaxShiftExpSumKernel::CLLogits1DMaxShiftExpSumKernel()
- : _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
-{
-}
-
-void CLLogits1DMaxShiftExpSumKernel::configure(const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, const SoftmaxKernelInfo &info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, max, output, sum, info);
-}
-
-void CLLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, const SoftmaxKernelInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, sum, output);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*sum->info(), input->info()->clone()->set_tensor_shape(max->info()->tensor_shape()));
- auto_init_if_empty(*output->info(), *input->info()->clone());
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DMaxShiftExpSum(input->info(), max->info(), output->info(), sum->info()));
-
- _input = input;
- _max = max;
- _output = output;
- _sum = sum;
-
- const DataType dt = input->info()->data_type();
- const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
- const size_t reduction_dim_size = input->info()->dimension(0);
- const float beta = info.beta;
- const auto is_signed_qasymm8 = is_data_type_quantized_asymmetric_signed(info.input_data_type);
- const int min_value = is_signed_qasymm8 ? CL_SCHAR_MIN : 0;
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt));
- build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(min_value));
- build_opts.add_option_if(is_signed_qasymm8, "-DQASYMM8_SIGNED");
- build_opts.add_option_if(dt == DataType::F16, "-DUSE_F16");
- build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + float_to_string_with_full_precision(beta));
- build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), prepare_quantized_softmax_build_options(qinfo.scale, beta).options());
- build_opts.add_option_if(info.is_log, "-DLOG_SOFTMAX");
-
- cl::NDRange lws_hint(cl::NullRange);
- std::string kernel_name = is_data_type_quantized_asymmetric(dt) ? std::string("softmax_layer_max_shift_exp_sum_quantized_serial") :
- std::string("softmax_layer_max_shift_exp_sum_serial");
- ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(reduction_dim_size);
- unsigned int vector_size = std::get<1>(parallel_reduction_info);
-
- build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
- build_opts.add_option("-DLOG_VECTOR_SIZE=" + support::cpp11::to_string(lround(log2(vector_size))));
- build_opts.add_option_if((reduction_dim_size % vector_size) != 0, "-DNON_MULTIPLE_OF_VECTOR_SIZE");
-
- // Configure parallel kernel if needed
- if(std::get<0>(parallel_reduction_info))
- {
- kernel_name = is_data_type_quantized_asymmetric(dt) ? std::string("softmax_layer_max_shift_exp_sum_quantized_parallel") : std::string("softmax_layer_max_shift_exp_sum_parallel");
- bool is_grid_size_pow2 = (_grid_size != 0) && ((_grid_size & (_grid_size - 1)) == 0);
- build_opts.add_option_if(is_grid_size_pow2 && _grid_size <= 256, "-DGRID_SIZE=" + support::cpp11::to_string(_grid_size));
-
- // Handle boundary conditions.
- const unsigned int multiple_grid_size = (reduction_dim_size / vector_size) % _grid_size;
- build_opts.add_option_if((multiple_grid_size != 0) || ((reduction_dim_size % vector_size) != 0), "-DNON_MULTIPLE_OF_GRID_SIZE");
- // Setting _lws_hint in this way can also communicate grid_size to CLLogits1DMaxShiftExpSumKernel::run().
- // A single workgroup performs reduction in dimension 0 in the parallel case, hence lws[0]==gws[0].
- lws_hint = cl::NDRange(_grid_size);
- }
-
- // Create kernel.
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set static arguments. Both the kernels use the same arguments
- unsigned int idx = 4 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
- _kernel.setArg<cl_uint>(idx++, reduction_dim_size);
-
- // Configure window
- auto win_config = validate_and_configure_window_1DMaxShiftExpSum(input->info(), max->info(), output->info(), sum->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second, lws_hint);
-}
-
-Status CLLogits1DMaxShiftExpSumKernel::validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DMaxShiftExpSum(input, max, output, sum));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_1DMaxShiftExpSum(input->clone().get(), max->clone().get(), output->clone().get(), sum->clone().get()).first);
-
- return Status{};
-}
-
-CLLogits1DMaxShiftExpSumKernel::ParallelReductionInfo CLLogits1DMaxShiftExpSumKernel::is_parallel_reduction(size_t size)
-{
- bool is_parallel_reduction = (size >= (_grid_size * _serial_vector_size)) && (_grid_size > 1);
- unsigned int vector_size = is_parallel_reduction ? _parallel_vector_size : _serial_vector_size;
- return std::make_tuple(is_parallel_reduction, vector_size);
-}
-
-void CLLogits1DMaxShiftExpSumKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- // Collapse window in Z dimension
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-
- // Reconfigure window in case of parallel reduction
- ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(_input->info()->dimension(0));
- if(std::get<0>(parallel_reduction_info))
- {
- // To launch grid_size parallel workitems, steps.x should be modified as follows.
- const unsigned int step = std::get<1>(parallel_reduction_info);
- window_collapsed.set(Window::DimX, Window::Dimension(0, _grid_size * step, step));
- }
-
- // Get slices
- Window slice = window_collapsed.first_slice_window_3D();
- do
- {
- unsigned int idx = 0;
- // Set inputs
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _max, slice);
- add_3D_tensor_argument(idx, _output, slice);
- add_3D_tensor_argument(idx, _sum, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
-}
-
-CLLogits1DNormKernel::CLLogits1DNormKernel()
- : _input(nullptr), _sum(nullptr), _output(nullptr)
-{
-}
-
-void CLLogits1DNormKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, const SoftmaxKernelInfo &info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, sum, output, info);
-}
-
-void CLLogits1DNormKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, const SoftmaxKernelInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
-
- // Note: output should always have a scale of 1/256 and offset 0
- const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type);
- const DataType output_data_type = info.input_data_type;
- const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log);
- const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(),
- input->info()->clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info));
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DNorm(input->info(), sum->info(), output->info(), info));
-
- _input = input;
- _sum = sum;
- _output = output;
-
- const auto is_signed_qasymm8 = is_data_type_quantized_asymmetric_signed(info.input_data_type);
- const int min_value = is_signed_qasymm8 ? CL_SCHAR_MIN : 0;
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(info.input_data_type));
- build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(min_value));
- build_opts.add_option_if(is_data_type_quantized_asymmetric_signed(info.input_data_type), "-DQASYMM8_SIGNED");
- build_opts.add_options_if(is_quantized_asymmetric,
- prepare_quantized_softmax_build_options(qinfo.scale, info.beta).options());
- build_opts.add_option_if(info.is_log, "-DLOG_SOFTMAX");
-
- // Create kernel
- std::string kernel_name = is_quantized_asymmetric ? "softmax_layer_norm_quantized" : "softmax_layer_norm";
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure window
- auto win_config = validate_and_configure_window_1DNorm(input->info(), output->info(), sum->info(), info);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-}
-
-Status CLLogits1DNormKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, const SoftmaxKernelInfo &info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DNorm(input, sum, output, info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_1DNorm(input->clone().get(), output->clone().get(), sum->clone().get(), info).first);
-
- return Status{};
-}
-
-void CLLogits1DNormKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = window_collapsed.first_slice_window_3D();
-
- do
- {
- Window sum_slice = slice;
- sum_slice.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- unsigned int idx = 0;
- // Set inputs
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _sum, sum_slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
-}
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
index 5900b085e8..f4c0839ad2 100644
--- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,12 +21,16 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
+#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
using namespace arm_compute::misc::shape_calculator;
@@ -35,47 +39,50 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *padddings, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *block_info,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, padddings, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(padddings->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON(padddings->tensor_shape()[1] != block_info->tensor_shape()[0]);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{2});
+ ARM_COMPUTE_RETURN_ERROR_ON(paddings->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{2, 2});
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const DataLayout data_layout = input->data_layout();
const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] != output->tensor_shape()[idx_channel]);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
return Status{};
}
-Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status validate_arguments_static(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const DataLayout data_layout = input->data_layout();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const int idx_batch = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] < padding_left.x() + padding_right.y());
- ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_width] + padding_left.x() + padding_right.x()) % block_shape_x != 0);
- ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] + padding_left.y() + padding_right.y()) % block_shape_y != 0);
- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] != output->tensor_shape()[idx_channel]);
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0);
+ TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+ input, block_shape_x, block_shape_y, padding_left, padding_right);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
@@ -87,17 +94,27 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
CLSpaceToBatchLayerKernel::CLSpaceToBatchLayerKernel()
: _input(nullptr), _block_shape(nullptr), _paddings(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input,
+ const ICLTensor *block_shape,
+ const ICLTensor *paddings,
+ ICLTensor *output)
{
configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output);
}
-void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ const ICLTensor *paddings,
+ ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
+ auto padding_info = get_padding_info({input, block_shape, paddings, output});
_input = input;
_block_shape = block_shape;
@@ -111,36 +128,52 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex
// Create kernel
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch)));
build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
build_opts.add_option("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
build_opts.add_option("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_batch)));
- _kernel = create_kernel(compile_context, "space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel = create_kernel(compile_context,
+ "space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
ICLKernel::configure_internal(win);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
- ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+ configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left,
+ padding_right, output);
}
-void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left,
- const Size2D &padding_right,
- ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+ TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+ input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, padding_right, output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left,
+ padding_right, output->info()));
_input = input;
_output = output;
@@ -152,7 +185,8 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex
// Create kernel
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch)));
@@ -165,22 +199,32 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex
build_opts.add_option("-DPAD_RIGHT_X=" + support::cpp11::to_string(padding_right.x()));
build_opts.add_option("-DPAD_LEFT_Y=" + support::cpp11::to_string(padding_left.y()));
build_opts.add_option("-DPAD_RIGHT_Y=" + support::cpp11::to_string(padding_right.y()));
- _kernel = create_kernel(compile_context, "space_to_batch_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel = create_kernel(
+ compile_context, "space_to_batch_static_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
ICLKernel::configure_internal(win);
}
-Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, paddings, output));
return Status{};
}
-Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
return Status{};
}
@@ -217,7 +261,6 @@ void CLSpaceToBatchLayerKernel::run(const Window &window, cl::CommandQueue &queu
add_3D_tensor_argument(idx, _output, slice_out);
enqueue(queue, *this, slice_out, lws_hint());
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h
new file mode 100644
index 0000000000..f9dce9db47
--- /dev/null
+++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLSPACETOBATCHLAYERKERNEL_H
+#define ARM_COMPUTE_CLSPACETOBATCHLAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the space to batch kernel */
+class CLSpaceToBatchLayerKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLSpaceToBatchLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLSpaceToBatchLayerKernel(const CLSpaceToBatchLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLSpaceToBatchLayerKernel &operator=(const CLSpaceToBatchLayerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLSpaceToBatchLayerKernel(CLSpaceToBatchLayerKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLSpaceToBatchLayerKernel &operator=(CLSpaceToBatchLayerKernel &&) = default;
+ /** Default destructor */
+ ~CLSpaceToBatchLayerKernel() = default;
+ /** Initialise the kernel's inputs and output.
+ *
+ * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
+ * @param[in] block_shape 1-D tensor with shape [M]. Supported M: 2. Data types supported: S32
+ * @param[in] paddings 2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32
+ * @param[out] output Tensor output. Data types supported: same as @p input
+ */
+ void configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output);
+ /** Initialise the kernel's inputs and output.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
+ * @param[in] block_shape 1-D tensor with shape [M]. Supported M: 2. Data types supported: S32
+ * @param[in] paddings 2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32
+ * @param[out] output Tensor output. Data types supported: same as @p input
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ const ICLTensor *paddings,
+ ICLTensor *output);
+ /** Initialise the kernel's input and output. (Static block shape and paddings)
+ *
+ * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
+ * @param[in] block_shape_x Block shape x value.
+ * @param[in] block_shape_y Block shape y value.
+ * @param[in] padding_left The padding at the beginning of every dimension of the output tensor.
+ * @param[in] padding_right The padding at the end of every dimension of the output tensor.
+ * @param[out] output Tensor output. Data types supported: same as @p input
+ */
+ void configure(const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output);
+ /** Initialise the kernel's input and output. (Static block shape and paddings)
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
+ * @param[in] block_shape_x Block shape x value.
+ * @param[in] block_shape_y Block shape y value.
+ * @param[in] padding_left The padding at the beginning of every dimension of the output tensor.
+ * @param[in] padding_right The padding at the end of every dimension of the output tensor.
+ * @param[out] output Tensor output. Data types supported: same as @p input
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel
+ *
+ * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
+ * @param[in] block_shape 1-D tensor with shape [M]. Supported M: 2. Data types supported: S32
+ * @param[in] paddings 2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32
+ * @param[in] output Tensor output. Data types supported: same as @p input
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel (Static block shape and paddings)
+ *
+ * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
+ * @param[in] block_shape_x Block shape x value.
+ * @param[in] block_shape_y Block shape y value.
+ * @param[in] padding_left The padding at the beginning of every dimension of the output tensor.
+ * @param[in] padding_right The padding at the end of every dimension of the output tensor.
+ * @param[in] output Tensor output. Data types supported: same as @p input
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ const ICLTensor *_block_shape; /**< Block shape tensor for dynamic evaluation */
+ const ICLTensor *_paddings; /**< Paddings tensor for dynamic evaluation */
+ ICLTensor *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLSPACETOBATCHLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
index 072e992735..25662b5c62 100644
--- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,12 +21,16 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h"
+#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
using namespace arm_compute::misc::shape_calculator;
@@ -42,7 +46,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const DataLayout data_layout = input->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -61,9 +65,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
}
} // namespace
-CLSpaceToDepthLayerKernel::CLSpaceToDepthLayerKernel()
- : _input(nullptr), _output(nullptr), _block_shape()
+CLSpaceToDepthLayerKernel::CLSpaceToDepthLayerKernel() : _input(nullptr), _output(nullptr), _block_shape()
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLSpaceToDepthLayerKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape)
@@ -71,11 +75,15 @@ void CLSpaceToDepthLayerKernel::configure(const ICLTensor *input, ICLTensor *out
configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
}
-void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ int32_t block_shape)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ auto padding_info = get_padding_info({input, output});
- TensorShape output_shape = compute_depth_to_space_shape(input->info(), block_shape);
+ TensorShape output_shape = compute_space_to_depth_shape(input->info(), block_shape);
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape));
@@ -89,15 +97,19 @@ void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_contex
// Create kernel
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(output->info()->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(output->info()->data_type())));
build_opts.add_option("-DCHANNEL_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_channel)));
build_opts.add_option("-DBLOCK_SHAPE=" + support::cpp11::to_string(block_shape));
build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
- _kernel = create_kernel(compile_context, "space_to_depth_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel = create_kernel(compile_context,
+ "space_to_depth_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
ICLKernel::configure_internal(win);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
Status CLSpaceToDepthLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
@@ -130,7 +142,6 @@ void CLSpaceToDepthLayerKernel::run(const Window &window, cl::CommandQueue &queu
enqueue(queue, *this, slice_out, lws_hint());
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h
new file mode 100644
index 0000000000..d0932919e0
--- /dev/null
+++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLSPACETODEPTHLAYERKERNEL_H
+#define ARM_COMPUTE_CLSPACETODEPTHLAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the space to depth kernel */
+class CLSpaceToDepthLayerKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLSpaceToDepthLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLSpaceToDepthLayerKernel(const CLSpaceToDepthLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLSpaceToDepthLayerKernel &operator=(const CLSpaceToDepthLayerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLSpaceToDepthLayerKernel(CLSpaceToDepthLayerKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLSpaceToDepthLayerKernel &operator=(CLSpaceToDepthLayerKernel &&) = default;
+ /** Default destructor */
+ ~CLSpaceToDepthLayerKernel() = default;
+ /** Initialise the kernel's inputs and output.
+ *
+ * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
+ * @param[out] output Tensor output. Data types supported: same as @p input
+ * @param[in] block_shape Block shape value. Must be at least 1.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+ /** Initialise the kernel's inputs and output.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
+ * @param[out] output Tensor output. Data types supported: same as @p input
+ * @param[in] block_shape Block shape value. Must be at least 1.
+ */
+ void
+ configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToDepthLayerKernel.
+ *
+ * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
+ * @param[in] output Tensor output info. Data types supported: same as @p input
+ * @param[in] block_shape Block shape value. Must be at least 1.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ ICLTensor *_output; /**< Destination tensor */
+ int32_t _block_shape; /**< Block shape */
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLSPACETODEPTHLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLStackLayerKernel.cpp b/src/core/CL/kernels/CLStackLayerKernel.cpp
index 33797d7e18..23e26716e7 100644
--- a/src/core/CL/kernels/CLStackLayerKernel.cpp
+++ b/src/core/CL/kernels/CLStackLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,21 +21,19 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLStackLayerKernel.h"
+#include "src/core/CL/kernels/CLStackLayerKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
using namespace arm_compute::misc::shape_calculator;
@@ -44,7 +42,11 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
@@ -53,9 +55,10 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned
ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_stack_shape(*input, axis, num_tensors));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+ compute_stack_shape(*input, axis, num_tensors));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
@@ -63,7 +66,8 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
{
// Output auto initialization if not yet initialized
auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors)));
@@ -75,17 +79,23 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsi
}
} // namespace
-CLStackLayerKernel::CLStackLayerKernel()
- : _input(nullptr), _output(nullptr)
+CLStackLayerKernel::CLStackLayerKernel() : _input(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLStackLayerKernel::configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output)
+void CLStackLayerKernel::configure(
+ const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output)
{
configure(CLKernelLibrary::get().get_compile_context(), input, axis, idx_input, num_tensors, output);
}
-void CLStackLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output)
+void CLStackLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info()));
@@ -98,7 +108,7 @@ void CLStackLayerKernel::configure(const CLCompileContext &compile_context, cons
// Add build options
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DAXIS=" + support::cpp11::to_string(axis));
build_opts.add_option("-DSRC_DIM2=" + support::cpp11::to_string(input->info()->dimension(2)));
build_opts.add_option("-DDST_DIM3=" + support::cpp11::to_string(output->info()->dimension(3)));
@@ -113,10 +123,15 @@ void CLStackLayerKernel::configure(const CLCompileContext &compile_context, cons
_kernel.setArg<cl_uint>(idx, idx_input);
}
-Status CLStackLayerKernel::validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+Status CLStackLayerKernel::validate(const ITensorInfo *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
return Status{};
}
diff --git a/src/core/CL/kernels/CLStackLayerKernel.h b/src/core/CL/kernels/CLStackLayerKernel.h
new file mode 100644
index 0000000000..d3c17f529c
--- /dev/null
+++ b/src/core/CL/kernels/CLStackLayerKernel.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_CLSTACKLAYERKERNEL_H
+#define ARM_COMPUTE_CLSTACKLAYERKERNEL_H
+
+#include "arm_compute/core/Types.h"
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to stack a rank-R tensor into one with rank-(R+1) along the axis dimension. */
+class CLStackLayerKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLStackLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLStackLayerKernel(const CLStackLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLStackLayerKernel &operator=(const CLStackLayerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLStackLayerKernel(CLStackLayerKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLStackLayerKernel &operator=(CLStackLayerKernel &&) = default;
+ /** Default destructor */
+ ~CLStackLayerKernel() = default;
+ /** Initialise the kernel's inputs and output
+ *
+ * @note Supported input tensor rank: up to 4
+ *
+ * @param[in] input Input tensor. Data types supported: All.
+ * @param[in] axis The dimension to stack the tensors along. It must not be greater than the number of input dimensions.
+ * @param[in] idx_input Index of the input tensor in the list of tensors to stack.
+ * All tensors in the list must have the same shape
+ * @param[in] num_tensors Number of tensors to stack
+ * @param[out] output Output tensor. Data types supported: Same as @p input.
+ *
+ */
+ void configure(
+ const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
+ /** Initialise the kernel's inputs and output
+ *
+ * @note Supported input tensor rank: up to 4
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Input tensor. Data types supported: All.
+ * @param[in] axis The dimension to stack the tensors along. It must not be greater than the number of input dimensions.
+ * @param[in] idx_input Index of the input tensor in the list of tensors to stack.
+ * All tensors in the list must have the same shape
+ * @param[in] num_tensors Number of tensors to stack
+ * @param[out] output Output tensor. Data types supported: Same as @p input.
+ *
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ ICLTensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLStackLayerKernel
+ *
+ * @note Supported input tensor rank: up to 4
+ *
+ * @param[in] input Input tensor info. Data types supported: All.
+ * @param[in] axis The dimension to stack the tensors along. It must not be greater than the number of input dimensions.
+ * @param[in] idx_input Index of the input tensor in the list of tensors to stack.
+ * All tensors in the list must have the same shape
+ * @param[in] num_tensors Number of tensors to stack
+ * @param[in] output Output tensor info. Data types supported: Same as @p input.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLSTACKLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp
index 18f02275e8..20cd835069 100644
--- a/src/core/CL/kernels/CLStridedSliceKernel.cpp
+++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,21 +21,32 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
+#include "src/core/CL/kernels/CLStridedSliceKernel.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/utils/helpers/bit_ops.h"
#include "arm_compute/core/utils/helpers/tensor_transform.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/helpers/bit_ops.h"
+#include "support/Cast.h"
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
@@ -44,19 +55,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions());
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i)
- {
- return i == 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) { return i == 0; }));
// Get expected output shape
- const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
+ const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(
+ *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
ARM_COMPUTE_RETURN_ERROR_ON(exp_output_shape.total_size() == 0);
// Checks output if configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const TensorInfo exp_output_info = output->clone()->set_tensor_shape(exp_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &exp_output_info);
@@ -65,93 +73,77 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
return Status{};
}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
-{
- // Output tensor auto initialization if not yet initialized
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
-
- // Create window
- Window win = calculate_max_window(*output, Steps());
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
-
- return std::make_pair(Status{}, win);
-}
} // namespace
CLStridedSliceKernel::CLStridedSliceKernel()
- : _input(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
-}
-
-void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void CLStridedSliceKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
-
- _input = input;
- _output = output;
+ auto padding_info = get_padding_info({input, output});
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
- const TensorShape &input_shape = input->info()->tensor_shape();
+ const TensorShape &input_shape = input->tensor_shape();
Coordinates starts_abs;
Coordinates ends_abs;
Coordinates final_strides;
- std::tie(starts_abs, ends_abs, final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(
- input_shape,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
+ std::tie(starts_abs, ends_abs, final_strides) =
+ arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(input_shape, starts, ends, strides,
+ begin_mask, end_mask, shrink_axis_mask);
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(
+ *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
+ Window win = calculate_max_window(*output, Steps());
// Enable multiple elements processing along x if stride_x is 1 and output width greater than the access vector size
- const int vec_size_x = 16 / input->info()->element_size();
- const int output_width_x = output->info()->tensor_shape().x();
+ const int vec_size_x = 16 / input->element_size();
+ const int output_width_x = output->tensor_shape().x();
const bool is_shrink_on_x = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, 0);
const bool multi_access_x = !is_shrink_on_x && (final_strides.x() == 1) && (output_width_x / vec_size_x > 0);
// Update window if needed
- if(multi_access_x)
+ if (multi_access_x)
{
- Window &updated_window = std::get<1>(win_config);
+ Window &updated_window = win;
updated_window.set(Window::DimX,
- Window::Dimension(updated_window.x().start(), ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x));
+ Window::Dimension(updated_window.x().start(),
+ ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x));
}
- ICLKernel::configure_internal(win_config.second);
+ ICLKernel::configure_internal(win);
// Create build options
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
- for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->data_type())));
+ for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
{
const bool is_shrink = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, i);
- build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(starts_abs[i]));
- build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(final_strides[i]));
+ build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" +
+ support::cpp11::to_string(starts_abs[i]));
+ build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" +
+ support::cpp11::to_string(final_strides[i]));
build_opts.add_option_if(is_shrink, "-DSHRINK_" + support::cpp11::to_string(i));
}
- build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+ build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(
+ std::max<int>(output_width_x - vec_size_x, 0)));
build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
- build_opts.add_option_if_else(input_shape.num_dimensions() > 2,
- "-DSRC_DEPTH=" + support::cpp11::to_string(input_shape.z()),
- "-DSRC_DEPTH=1");
- build_opts.add_option_if_else(_output->info()->num_dimensions() > 2,
- "-DDST_DEPTH=" + support::cpp11::to_string(_output->info()->tensor_shape().z()),
+ build_opts.add_option_if_else(output->num_dimensions() > 2,
+ "-DDST_DEPTH=" + support::cpp11::to_string(output->tensor_shape().z()),
"-DDST_DEPTH=1");
// Create kernel
@@ -160,11 +152,11 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co
// Set config_id for enabling LWS tuning
_config_id = "strided_slice";
_config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+ _config_id += lower_string(string_from_data_type(input->data_type()));
+ for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
{
_config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(i));
+ _config_id += support::cpp11::to_string(input->dimension(i));
_config_id += "_";
_config_id += support::cpp11::to_string(starts_abs[i]);
_config_id += "_";
@@ -172,35 +164,42 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co
_config_id += "_";
_config_id += support::cpp11::to_string(final_strides[i]);
}
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status CLStridedSliceKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
- starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
return Status{};
}
-void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
Window slice = window_collapsed.first_slice_window_4D();
do
{
unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice);
- add_4D_tensor_argument(idx, _output, slice);
+ add_4D_tensor_argument(idx, src, slice);
+ add_4D_tensor_argument(idx, dst, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_4D(slice));
+ } while (window_collapsed.slide_window_slice_4D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.h b/src/core/CL/kernels/CLStridedSliceKernel.h
new file mode 100644
index 0000000000..1cf5bcacec
--- /dev/null
+++ b/src/core/CL/kernels/CLStridedSliceKernel.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H
+#define ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H
+
+#include "arm_compute/core/Types.h"
+
+#include "src/core/CL/ICLKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+/** Interface for the kernel to perform tensor strided slicing */
+class CLStridedSliceKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLStridedSliceKernel();
+
+ /** Configure kernel
+ *
+ * @note Supported tensor rank: up to 4
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor info. Data type supported: All.
+ * @param[out] output Destination tensor info. Data type supported: Same as @p input
+ * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ * @param[in] strides The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ * @param[in] begin_mask If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
+ * @param[in] end_mask If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
+ * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
+ * A slice of size 1 starting from starts[i] in the dimension must be preserved.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel
+ *
+ * @note Supported tensor rank: up to 4
+ *
+ * @param[in] input Source tensor info. Data type supported: All.
+ * @param[in] output Destination tensor info. Data type supported: Same as @p input
+ * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ * @param[in] strides The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ * @param[in] begin_mask If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
+ * @param[in] end_mask If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
+ * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
+ * A slice of size 1 starting from starts[i] in the dimension must be preserved.
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H */
diff --git a/src/core/CL/kernels/CLTableLookupKernel.cpp b/src/core/CL/kernels/CLTableLookupKernel.cpp
deleted file mode 100644
index 07827d5bdd..0000000000
--- a/src/core/CL/kernels/CLTableLookupKernel.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLLut.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-
-#include <cstdint>
-#include <string>
-
-using namespace arm_compute;
-
-void CLTableLookupKernel::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, lut, output);
-}
-
-void CLTableLookupKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLLut *lut, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON(lut == nullptr);
- ARM_COMPUTE_ERROR_ON(DataType::U8 != lut->type() && DataType::S16 != lut->type());
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
- // Create kernel
- std::string kernel_name = (DataType::S16 == lut->type()) ? "tablelookup_S16" : "tablelookup_U8";
- _kernel = create_kernel(compile_context, kernel_name);
-
- // Set lut argument
- unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
- _kernel.setArg(idx++, lut->cl_buffer());
- if(DataType::S16 == lut->type())
- {
- _kernel.setArg(idx++, lut->index_offset());
- _kernel.setArg(idx++, static_cast<uint32_t>(lut->num_elements()));
- }
-
- // Configure kernel
- constexpr unsigned int num_elems_processed_per_iteration = 8;
- ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
-}
diff --git a/src/core/CL/kernels/CLThresholdKernel.cpp b/src/core/CL/kernels/CLThresholdKernel.cpp
deleted file mode 100644
index 4f984632bc..0000000000
--- a/src/core/CL/kernels/CLThresholdKernel.cpp
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLThresholdKernel.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <string>
-
-using namespace arm_compute;
-
-void CLThresholdKernel::configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold,
- uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, threshold, false_value, true_value, type, upper);
-}
-
-void CLThresholdKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, uint8_t threshold,
- uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
- // Construct kernel name
- std::string kernel_name = "threshold";
-
- switch(type)
- {
- case ThresholdType::BINARY:
- kernel_name += "_binary";
- break;
- case ThresholdType::RANGE:
- kernel_name += "_range";
- break;
- default:
- ARM_COMPUTE_ERROR("Thresholding type not recognized");
- break;
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name);
-
- // Set arguments
- unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
- _kernel.setArg(idx++, false_value);
- _kernel.setArg(idx++, true_value);
- _kernel.setArg(idx++, threshold);
-
- if(ThresholdType::RANGE == type)
- {
- _kernel.setArg(idx++, upper);
- }
-
- // Make sure _kernel is initialized before calling the parent's configure
- constexpr unsigned int num_elems_processed_per_iteration = 16;
- ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
-}
diff --git a/src/core/CL/kernels/CLTileKernel.cpp b/src/core/CL/kernels/CLTileKernel.cpp
index 2838251cc2..fa996c4008 100644
--- a/src/core/CL/kernels/CLTileKernel.cpp
+++ b/src/core/CL/kernels/CLTileKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,10 +21,14 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLTileKernel.h"
+#include "src/core/CL/kernels/CLTileKernel.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
@@ -37,15 +41,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty());
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e)
- {
- return e == 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) { return e == 0; }));
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -53,9 +55,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-CLTileKernel::CLTileKernel()
- : _input(nullptr), _output(nullptr)
+CLTileKernel::CLTileKernel() : _input(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLTileKernel::configure(const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
@@ -63,7 +65,10 @@ void CLTileKernel::configure(const ICLTensor *input, ICLTensor *output, const Mu
configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples);
}
-void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
+void CLTileKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Multiples &multiples)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -77,11 +82,13 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT
_input = input;
_output = output;
- const DataType data_type = input->info()->data_type();
- const int vec_size_x = 16 / input->info()->element_size();
- const int input_width_x = input->info()->tensor_shape().x();
- const unsigned int offset = ceil_to_multiple(input_width_x, vec_size_x) - input_width_x;
- const bool multi_access_x = (input_width_x / vec_size_x > 0);
+ const DataType data_type = input->info()->data_type();
+ const int vec_size_x = 16 / input->info()->element_size();
+ const int input_width_x = input->info()->tensor_shape().x();
+ const unsigned int input_width_ceil = ceil_to_multiple(input_width_x, vec_size_x);
+ const unsigned int input_width_tiles = input_width_ceil / vec_size_x;
+ const unsigned int offset = input_width_ceil - input_width_x;
+ const bool multi_access_x = (input_width_x / vec_size_x > 0);
// Create kernel
CLBuildOptions build_opts;
@@ -93,20 +100,20 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT
build_opts.add_option("-DDST_DEPTH=" + support::cpp11::to_string(output->info()->dimension(2)));
build_opts.add_option_if(multi_access_x, "-DOFFSET=" + support::cpp11::to_string(offset));
build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option_if(multi_access_x, "-DSRC_WIDTH_TILES=" + support::cpp11::to_string(input_width_tiles));
_kernel = create_kernel(compile_context, "tile", build_opts.options());
// Configure window without padding
Window win = calculate_max_window(*output->info());
- if(multi_access_x)
+ if (multi_access_x)
{
// If multi-access is enabled, no thread should cross the tile boundaries. This means we need
// as many threads as those to cover a single tile times multiples[0]. Note that if threads
// do not cross the boundaries of the tiles, they won't cross the boundaries of the last tile, and
// we don't need to pad the output
const unsigned int size_win_x = ceil_to_multiple(input->info()->dimension(0), vec_size_x) * multiples[0];
- win.set(Window::DimX,
- Window::Dimension(win.x().start(), size_win_x, vec_size_x));
+ win.set(Window::DimX, Window::Dimension(win.x().start(), size_win_x, vec_size_x));
}
ICLKernel::configure_internal(win);
@@ -115,7 +122,7 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT
_config_id = "tile";
_config_id += "_";
_config_id += lower_string(string_from_data_type(input->info()->data_type()));
- for(unsigned int i = 0; i < multiples.size(); ++i)
+ for (unsigned int i = 0; i < multiples.size(); ++i)
{
_config_id += "_";
_config_id += support::cpp11::to_string(input->info()->dimension(i));
@@ -144,7 +151,6 @@ void CLTileKernel::run(const Window &window, cl::CommandQueue &queue)
add_4D_tensor_argument(idx, _input, slice);
add_4D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_4D(slice));
+ } while (collapsed.slide_window_slice_4D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLTileKernel.h b/src/core/CL/kernels/CLTileKernel.h
new file mode 100644
index 0000000000..c3486aecef
--- /dev/null
+++ b/src/core/CL/kernels/CLTileKernel.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLTILEKERNEL_H
+#define ARM_COMPUTE_CLTILEKERNEL_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform a Tile operation */
+class CLTileKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLTileKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTileKernel(const CLTileKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTileKernel &operator=(const CLTileKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLTileKernel(CLTileKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLTileKernel &operator=(CLTileKernel &&) = default;
+ /** Default destructor */
+ ~CLTileKernel() = default;
+ /** Set the source, destination of the kernel
+ *
+ * @param[in] input Source tensor. Data type supported: All.
+ * @param[in] multiples Contains the number of times the input tensor should be replicated on the given dimension.
+ * Cannot have more than 4 elements (tiling in dimensions greater than 4 is not supported).
+ * @param[out] output Destination tensor. Data type supported: Same as @p input
+ *
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const Multiples &multiples);
+ /** Set the source, destination of the kernel
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. Data type supported: All.
+ * @param[in] multiples Contains the number of times the input tensor should be replicated on the given dimension.
+ * Cannot have more than 4 elements (tiling in dimensions greater than 4 is not supported).
+ * @param[out] output Destination tensor. Data type supported: Same as @p input
+ *
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Multiples &multiples);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLTileKernel
+ *
+ * @param[in] input Source tensor info. Data type supported: All.
+ * @param[in] multiples Contains the number of times the input tensor should be replicated on the given dimension.
+ * Cannot have more than 4 elements (tiling in dimensions greater than 4 is not supported).
+ * @param[in] output Destination tensor info. Data type supported: Same as @p input
+ *
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLTILEKERNEL_H */
diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp
deleted file mode 100644
index a28b685cb2..0000000000
--- a/src/core/CL/kernels/CLTransposeKernel.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-
-#include <set>
-#include <sstream>
-#include <string>
-
-namespace arm_compute
-{
-namespace
-{
-TensorShape transposed_tensor_shape(const TensorShape &in)
-{
- TensorShape output_shape{ in };
- const size_t w_out = in[1];
- const size_t h_out = in[0];
- output_shape.set(0, w_out);
- output_shape.set(1, h_out);
-
- return output_shape;
-}
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
-
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info = input->clone()->set_tensor_shape(transposed_tensor_shape(input->tensor_shape()));
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- // Configure kernel window
- const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / input->element_size();
-
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration));
-
- AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win, input_access);
-
- if(output->total_size() != 0)
- {
- AccessWindowTranspose output_access(output, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
-
- window_changed = window_changed || update_window_and_padding(win, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-Status CLTransposeKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
- return Status{};
-}
-
-void CLTransposeKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLTransposeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output tensor auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(transposed_tensor_shape(input->info()->tensor_shape())));
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
- _input = input;
- _output = output;
-
- std::set<std::string> build_opts;
- std::ostringstream data_type_in_bytes;
- data_type_in_bytes << input->info()->element_size();
- build_opts.emplace("-DDATA_TYPE_IN_BYTES=" + data_type_in_bytes.str());
-
- _kernel = create_kernel(compile_context, "transpose", build_opts);
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second, cl::NDRange(2, 8));
-}
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
deleted file mode 100644
index dd6f85fe12..0000000000
--- a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-CLUpsampleLayerKernel::CLUpsampleLayerKernel()
- : _input(nullptr), _output(nullptr), _info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration_input_x()
-{
-}
-
-Status CLUpsampleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &info, const InterpolationPolicy upsampling_policy)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_UNUSED(upsampling_policy);
-
- DataLayout data_layout = input->data_layout();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_width) != info.x() * input->dimension(idx_width));
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_height) != info.y() * input->dimension(idx_height));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.x() != 2 || info.y() != 2, "Only stride 2 is supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(upsampling_policy != InterpolationPolicy::NEAREST_NEIGHBOR, "Only nearest neighbor policy supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-
- return Status{};
-}
-
-void CLUpsampleLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &info, const InterpolationPolicy upsampling_policy)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, info, upsampling_policy);
-}
-
-void CLUpsampleLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &info, const InterpolationPolicy upsampling_policy)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_UNUSED(upsampling_policy);
-
- _input = input;
- _output = output;
- _info = info;
- _data_layout = input->info()->data_layout();
- _num_elems_processed_per_iteration_input_x = 1;
-
- TensorShape output_shape = misc::shape_calculator::compute_upsample_shape(*input->info(), info);
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
- output->info()->set_data_layout(_data_layout);
-
- unsigned int num_elems_processed_per_iteration_x = 16;
- const int output_width_x = output->info()->dimension(0);
- const bool multi_access_x = ((output_width_x / num_elems_processed_per_iteration_x) > 0);
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(CLUpsampleLayerKernel::validate(input->info(), output->info(), info, upsampling_policy));
-
- Window win{};
-
- switch(_data_layout)
- {
- case DataLayout::NCHW:
- {
- win = calculate_max_window(*output->info());
- win.set(Window::DimY, Window::Dimension(win.y().start(), win.y().end(), info.y()));
- if(multi_access_x)
- {
- _num_elems_processed_per_iteration_input_x = num_elems_processed_per_iteration_x / info.x();
- win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), num_elems_processed_per_iteration_x), num_elems_processed_per_iteration_x));
- }
- break;
- }
- case DataLayout::NHWC:
- {
- win = calculate_max_window(*output->info());
- win.set(Window::DimY, Window::Dimension(win.y().start(), win.y().end(), info.x()));
- win.set(Window::DimZ, Window::Dimension(win.z().start(), win.z().end(), info.y()));
- if(multi_access_x)
- {
- _num_elems_processed_per_iteration_input_x = num_elems_processed_per_iteration_x;
- win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(),
- num_elems_processed_per_iteration_x),
- num_elems_processed_per_iteration_x));
- }
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
-
- // Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- build_opts.add_option_if(multi_access_x, "-DVEC_SIZE_IN=" + support::cpp11::to_string(_num_elems_processed_per_iteration_input_x));
- build_opts.add_option_if(multi_access_x, "-DVEC_SIZE_OUT=" + support::cpp11::to_string(num_elems_processed_per_iteration_x));
- build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X_IN=" + support::cpp11::to_string(std::max<int>(_input->info()->dimension(0) - _num_elems_processed_per_iteration_input_x, 0)));
- build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X_OUT=" + support::cpp11::to_string(std::max<int>(output_width_x - num_elems_processed_per_iteration_x, 0)));
- _kernel = create_kernel(compile_context, "upsample_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
-
- ICLKernel::configure_internal(win);
-}
-
-void CLUpsampleLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window collapsed_window = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice_out = collapsed_window.first_slice_window_3D();
- Window slice_in = collapsed_window.first_slice_window_3D();
-
- switch(_data_layout)
- {
- case DataLayout::NCHW:
- slice_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration_input_x));
- slice_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), 1));
- break;
- case DataLayout::NHWC:
- slice_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), 1));
- slice_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), 1));
- break;
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- add_3D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out, lws_hint());
- }
- while(collapsed_window.slide_window_slice_3D(slice_out) && collapsed_window.slide_window_slice_3D(slice_in));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLWarpAffineKernel.cpp b/src/core/CL/kernels/CLWarpAffineKernel.cpp
deleted file mode 100644
index c40c614687..0000000000
--- a/src/core/CL/kernels/CLWarpAffineKernel.cpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <set>
-#include <sstream>
-#include <string>
-
-namespace arm_compute
-{
-namespace
-{
-void options_add_matrix(std::set<std::string> &options, const std::array<float, 9> &matrix)
-{
- for(size_t i = 0; i < 6; ++i)
- {
- std::stringstream mat_str;
- mat_str << "-DMAT" << i << "=" << matrix[i] << " ";
- options.insert(mat_str.str());
- }
-}
-} // namespace
-
-BorderSize CLWarpAffineKernel::border_size() const
-{
- return BorderSize(1);
-}
-
-void CLWarpAffineKernel::configure(const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, matrix, policy);
-}
-
-void CLWarpAffineKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON(InterpolationPolicy::AREA == policy);
-
- _input = input;
- _output = output;
-
- // Create build options
- std::set<std::string> options;
- options_add_matrix(options, matrix);
- options.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-
- // Create kernel
- std::string interpolation_name = string_from_interpolation_policy(policy);
- std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
- const std::string kernel_name = "warp_affine_" + interpolation_name;
- _kernel = create_kernel(compile_context, kernel_name, options);
-
- // Set static kernel arguments
- unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
- _kernel.setArg<cl_int>(idx++, input->info()->dimension(0));
- _kernel.setArg<cl_int>(idx++, input->info()->dimension(1));
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 4;
-
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-
- int total_right = ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration);
- const int access_right = total_right + (((total_right - input->info()->dimension(0)) == 0) ? border_size().right : 0);
-
- AccessWindowStatic input_access(input->info(), -border_size().left, -border_size().top, access_right, input->info()->dimension(1) + border_size().bottom);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(3));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(3));
- _config_id += "_";
- _config_id += lower_string(string_from_interpolation_policy(policy));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
deleted file mode 100644
index bc08549b55..0000000000
--- a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-
-#include <cstddef>
-#include <set>
-#include <sstream>
-#include <string>
-
-using namespace arm_compute;
-
-namespace
-{
-inline void options_add_matrix(std::set<std::string> &options, const std::array<float, 9> &matrix)
-{
- for(size_t i = 0; i < 9; ++i)
- {
- std::stringstream mat_str;
- mat_str << "-DMAT" << i << "=" << matrix[i] << " ";
- options.insert(mat_str.str());
- }
-}
-} // namespace
-
-BorderSize CLWarpPerspectiveKernel::border_size() const
-{
- return BorderSize(1);
-}
-
-void CLWarpPerspectiveKernel::configure(const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, matrix, policy);
-}
-
-void CLWarpPerspectiveKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON(InterpolationPolicy::AREA == policy);
-
- _input = input;
- _output = output;
-
- // Create build options
- std::set<std::string> options;
- options_add_matrix(options, matrix);
- options.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-
- // Create kernel
- std::string interpolation_name = string_from_interpolation_policy(policy);
- std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
- std::string kernel_name = "warp_perspective_" + interpolation_name;
- _kernel = create_kernel(compile_context, kernel_name, options);
-
- // Set static kernel arguments
- unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
- _kernel.setArg<cl_int>(idx++, input->info()->dimension(0));
- _kernel.setArg<cl_int>(idx++, input->info()->dimension(1));
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 4;
-
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowStatic input_access(input->info(), -border_size().left, -border_size().top, input->info()->dimension(0) + border_size().right, input->info()->dimension(1) + border_size().bottom);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-}
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
deleted file mode 100644
index 873f3b3022..0000000000
--- a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON(num_groups == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::NHWC && num_groups > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4 && num_groups > 1);
- ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(3) % num_groups) != 0);
-
- if(biases != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(!is_data_type_float(input->data_type()));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1));
- ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->num_dimensions() != 2));
- ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->dimension(0) != input->tensor_shape()[3]));
- ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4]));
- }
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_weights_reshaped_shape(*input, biases != nullptr, num_groups));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- }
-
- return Status{};
-}
-} // namespace
-
-CLWeightsReshapeKernel::CLWeightsReshapeKernel()
- : _input(nullptr), _biases(nullptr), _output(nullptr)
-{
-}
-
-void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, biases, output, num_groups);
-}
-
-void CLWeightsReshapeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output tensor auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_weights_reshaped_shape(*input->info(), (biases != nullptr), num_groups)));
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
- (biases != nullptr) ? biases->info() : nullptr,
- output->info(), num_groups));
-
- const DataType data_type = input->info()->data_type();
-
- _biases = biases;
- _output = output;
- _input = input;
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(data_type)));
- build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
- build_opts.add_option_if(biases != nullptr, "-DHAS_BIAS");
-
- // Create kernel
- _kernel = create_kernel(compile_context, "reshape_to_columns", build_opts.options());
-
- // Configure window
- Window win = calculate_max_window(*input->info(), Steps());
- // The CLWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
- ICLKernel::configure_internal(win);
-}
-
-Status CLWeightsReshapeKernel::validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, biases, output, num_groups));
- return Status{};
-}
-
-void CLWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- Window out_window;
- out_window.use_tensor_dimensions(_output->info()->tensor_shape());
-
- Window in_slice = window.first_slice_window_3D();
- Window out_slice = out_window.first_slice_window_2D();
-
- Window biases_window;
- Window biases_slice;
-
- unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
- idx += (_biases != nullptr) ? num_arguments_per_1D_tensor() : 0;
- _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(0));
- _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(1));
- _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(2));
- _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(3));
- _kernel.setArg<cl_uint>(idx++, _output->info()->strides_in_bytes().z());
-
- if(_biases != nullptr)
- {
- biases_window.use_tensor_dimensions(_biases->info()->tensor_shape());
- biases_slice = biases_window.first_slice_window_1D();
- }
-
- do
- {
- // Set arguments
- unsigned idx = 0;
- add_3D_tensor_argument(idx, _input, in_slice);
- add_2D_tensor_argument(idx, _output, out_slice);
- if(_biases != nullptr)
- {
- add_1D_tensor_argument(idx, _biases, biases_slice);
- ARM_COMPUTE_UNUSED(biases_window.slide_window_slice_1D(biases_slice));
- }
-
- // Run kernel
- enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
deleted file mode 100644
index aba2af1bb7..0000000000
--- a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/helpers/tensor_info.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 8;
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
-{
- // The window needs to be based on the output
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration), input1->dimension(1));
- const unsigned int input2_right_padding = (output->dimension(0) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration - input1->dimension(
- 0) + num_elems_processed_per_iteration - input2->dimension(0);
- AccessWindowStatic input2_access(input2, -(input1->dimension(0) % num_elems_processed_per_iteration),
- 0, input2->dimension(0) + input2_right_padding, input2->dimension(1));
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- bool window_changed = update_window_and_padding(win, input1_access, input2_access, output_access);
-
- Window win_collapsed = win.collapse(win, Window::DimZ);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win_collapsed);
-}
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1);
- ARM_COMPUTE_RETURN_ERROR_ON(input1->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) + input2->dimension(0) > output->dimension(0));
-
- for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(i) != output->dimension(i));
- ARM_COMPUTE_RETURN_ERROR_ON(input2->dimension(i) != output->dimension(i));
- }
- ARM_COMPUTE_RETURN_ERROR_ON(input1->num_dimensions() > 4);
-
- return Status{};
-}
-} // namespace
-
-CLWidthConcatenate2TensorsKernel::CLWidthConcatenate2TensorsKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-Status CLWidthConcatenate2TensorsKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
- return Status{};
-}
-
-void CLWidthConcatenate2TensorsKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
-}
-
-void CLWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));
-
- _input1 = input1;
- _input2 = input2;
- _output = output;
-
- // Add build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input1->info()->data_type()));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
- build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->info()->dimension(0)));
- build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->info()->element_size()));
-
- // If input have different quantization info set quantization parameters needed for the re-quantization process
- const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(output->info(), input1->info(), input2->info());
- if(is_data_type_quantized_asymmetric(input1->info()->data_type()) && have_different_qinfo)
- {
- const UniformQuantizationInfo iq1_info = input1->info()->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = input2->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
-
- build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq1_info.offset));
- build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale));
- build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(iq2_info.offset));
- build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale));
- build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
- build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, "concatenate_width_x2", build_opts.options());
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
- ICLKernel::configure_internal(std::get<1>(win_config));
-
- // Set output valid region
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- // Pass paddings as arguments to the kernel
- const unsigned int input1_width = input1->info()->dimension(0);
- const unsigned int input1_right_padding = ceil_to_multiple(input1_width, num_elems_processed_per_iteration) - input1_width;
- const unsigned int input2_left_padding = input1_width % num_elems_processed_per_iteration;
- unsigned int idx0 = 3 * num_arguments_per_4D_tensor();
- _kernel.setArg<cl_uint>(idx0++, input1_right_padding);
- _kernel.setArg<cl_uint>(idx0++, input2_left_padding);
-
- // Set config_id for enabling LWS tuning
- _config_id = "concatenate_width_x2_";
- _config_id += lower_string(string_from_data_type(input1->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input1->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input1->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input2->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input2->info()->dimension(1));
-}
-
-void CLWidthConcatenate2TensorsKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_4D();
-
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input1, slice);
- add_4D_tensor_argument(idx, _input2, slice);
- add_4D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, window, lws_hint());
- }
- while(window.slide_window_slice_4D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
deleted file mode 100644
index e5eb8b3f55..0000000000
--- a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
+++ /dev/null
@@ -1,237 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/helpers/tensor_info.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 8;
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *input3, ITensorInfo *input4, ITensorInfo *output)
-{
- const unsigned int input1_width = input1->dimension(0);
- const unsigned int input2_width = input2->dimension(0);
- const unsigned int input3_width = input3->dimension(0);
- const unsigned int input4_width = input4->dimension(0);
-
- // The window needs to be based on the output
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1_width, num_elems_processed_per_iteration), input1->dimension(1));
-
- const unsigned int input2_left_padding = input1_width % num_elems_processed_per_iteration;
- const unsigned int input2_right_padding = ((input1_width + input2_width) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration - input1_width + num_elems_processed_per_iteration -
- input2_width;
- AccessWindowStatic input2_access(input2, -input2_left_padding, 0, input2_width + input2_right_padding, input2->dimension(1));
-
- const unsigned int input3_left_padding = (input1_width + input2_width) % num_elems_processed_per_iteration;
- const unsigned int input3_right_padding = ((input1_width + input2_width + input3_width) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration - input1_width - input2_width +
- num_elems_processed_per_iteration - input3_width;
- AccessWindowStatic input3_access(input3, -input3_left_padding, 0, input3_width + input3_right_padding, input3->dimension(1));
-
- const unsigned int input4_left_padding = (input1_width + input2_width + input3_width) % num_elems_processed_per_iteration;
- const unsigned int input4_right_padding = (output->dimension(0) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration + num_elems_processed_per_iteration - output->dimension(0);
- AccessWindowStatic input4_access(input4, -input4_left_padding, 0, input4_width + input4_right_padding, input4->dimension(1));
-
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- bool window_changed = update_window_and_padding(win, input1_access, input2_access, input3_access, input4_access, output_access);
-
- Window win_collapsed = win.collapse(win, Window::DimZ);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win_collapsed);
-}
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, input3, input4, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1);
- ARM_COMPUTE_RETURN_ERROR_ON(input1->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, input3, input4, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) + input2->dimension(0) + input3->dimension(0) + input4->dimension(0) > output->dimension(0));
-
- for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(i) != output->dimension(i));
- ARM_COMPUTE_RETURN_ERROR_ON(input2->dimension(i) != output->dimension(i));
- ARM_COMPUTE_RETURN_ERROR_ON(input3->dimension(i) != output->dimension(i));
- ARM_COMPUTE_RETURN_ERROR_ON(input4->dimension(i) != output->dimension(i));
- }
- ARM_COMPUTE_RETURN_ERROR_ON(input1->num_dimensions() > 4);
-
- return Status{};
-}
-} // namespace
-
-CLWidthConcatenate4TensorsKernel::CLWidthConcatenate4TensorsKernel()
- : _input1(nullptr), _input2(nullptr), _input3(nullptr), _input4(nullptr), _output(nullptr)
-{
-}
-
-Status CLWidthConcatenate4TensorsKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, input3, input4, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), input3->clone().get(), input4->clone().get(), output->clone().get()).first);
- return Status{};
-}
-
-void CLWidthConcatenate4TensorsKernel::configure(const ICLTensor *input1, const ICLTensor *input2, const ICLTensor *input3, const ICLTensor *input4, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input1, input2, input3, input4, output);
-}
-
-void CLWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, const ICLTensor *input3, const ICLTensor *input4,
- ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, input3, input4, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), input3->info(), input4->info(), output->info()));
-
- _input1 = input1;
- _input2 = input2;
- _input3 = input3;
- _input4 = input4;
- _output = output;
-
- // Add build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input1->info()->data_type()));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
- build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->info()->dimension(0)));
- build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(input2->info()->dimension(0)));
- build_opts.add_option("-DINPUT3_WIDTH=" + support::cpp11::to_string(input3->info()->dimension(0)));
- build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->info()->element_size()));
-
- // If input have different quantization info set quantization parameters needed for the re-quantization process
- const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(output->info(), input1->info(), input2->info(), input3->info(), input4->info());
- if(is_data_type_quantized_asymmetric(input1->info()->data_type()) && have_different_qinfo)
- {
- const UniformQuantizationInfo iq1_info = input1->info()->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = input2->info()->quantization_info().uniform();
- const UniformQuantizationInfo iq3_info = input3->info()->quantization_info().uniform();
- const UniformQuantizationInfo iq4_info = input4->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
-
- build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq1_info.offset));
- build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale));
- build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(iq2_info.offset));
- build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale));
- build_opts.add_option("-DOFFSET_IN3=" + float_to_string_with_full_precision(iq3_info.offset));
- build_opts.add_option("-DSCALE_IN3=" + float_to_string_with_full_precision(iq3_info.scale));
- build_opts.add_option("-DOFFSET_IN4=" + float_to_string_with_full_precision(iq4_info.offset));
- build_opts.add_option("-DSCALE_IN4=" + float_to_string_with_full_precision(iq4_info.scale));
- build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
- build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, "concatenate_width_x4", build_opts.options());
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input1->info(), input2->info(), input3->info(), input4->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
- ICLKernel::configure_internal(std::get<1>(win_config));
-
- // Set output valid region
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- // Pass paddings as arguments to the kernel
- const unsigned int input1_width = input1->info()->dimension(0);
- const unsigned int input2_width = input2->info()->dimension(0);
- const unsigned int input3_width = input3->info()->dimension(0);
-
- const unsigned int input1_right_padding = ceil_to_multiple(input1_width, num_elems_processed_per_iteration) - input1_width;
- const unsigned int input2_left_padding = input1_width % num_elems_processed_per_iteration;
- const unsigned int input2_right_padding = ((input1_width + input2_width) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration - input1_width + num_elems_processed_per_iteration -
- input2_width;
- const unsigned int input3_left_padding = (input1_width + input2_width) % num_elems_processed_per_iteration;
- const unsigned int input3_right_padding = ((input1_width + input2_width + input3_width) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration - input1_width - input2_width +
- num_elems_processed_per_iteration - input3_width;
- const unsigned int input4_left_padding = (input1_width + input2_width + input3_width) % num_elems_processed_per_iteration;
- unsigned int idx0 = 5 * num_arguments_per_4D_tensor();
- _kernel.setArg<cl_uint>(idx0++, input1_right_padding);
- _kernel.setArg<cl_uint>(idx0++, input2_left_padding);
- _kernel.setArg<cl_uint>(idx0++, input2_right_padding);
- _kernel.setArg<cl_uint>(idx0++, input3_left_padding);
- _kernel.setArg<cl_uint>(idx0++, input3_right_padding);
- _kernel.setArg<cl_uint>(idx0++, input4_left_padding);
-
- // Set config_id for enabling LWS tuning
- _config_id = "concatenate_width_x4_";
- _config_id += lower_string(string_from_data_type(input1->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input1->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input1->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input2->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input2->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input3->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input3->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input4->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input4->info()->dimension(1));
-}
-
-void CLWidthConcatenate4TensorsKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_4D();
-
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input1, slice);
- add_4D_tensor_argument(idx, _input2, slice);
- add_4D_tensor_argument(idx, _input3, slice);
- add_4D_tensor_argument(idx, _input4, slice);
- add_4D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, window, lws_hint());
- }
- while(window.slide_window_slice_4D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
deleted file mode 100644
index 8eba293487..0000000000
--- a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
-#include "support/StringSupport.h"
-
-#include <map>
-
-namespace arm_compute
-{
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int width_offset, ITensorInfo *output)
-{
- // The window needs to be based on input as we copy all the widths of input
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, width_offset, num_elems_processed_per_iteration);
- bool window_changed = update_window_and_padding(win, input_access, output_access);
-
- Window win_collapsed = win.collapse(win, Window::DimZ);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win_collapsed);
-}
-Status validate_arguments(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) + width_offset > output->dimension(0));
-
- for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
- }
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
-
- return Status{};
-}
-} // namespace
-
-CLWidthConcatenateLayerKernel::CLWidthConcatenateLayerKernel()
- : _input(nullptr), _output(nullptr), _width_offset(0)
-{
-}
-
-Status CLWidthConcatenateLayerKernel::validate(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, width_offset, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), width_offset, output->clone().get()).first);
- return Status{};
-}
-
-void CLWidthConcatenateLayerKernel::configure(const ICLTensor *input, unsigned int width_offset, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, width_offset, output);
-}
-
-void CLWidthConcatenateLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int width_offset, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), width_offset, output->info()));
-
- _input = input;
- _output = output;
- _width_offset = width_offset;
-
- // Add build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(_width_offset));
- build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
-
- if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info())
- {
- const UniformQuantizationInfo iqinfo = input->info()->quantization_info().uniform();
- const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform();
-
- build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iqinfo.offset));
- build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oqinfo.offset));
- build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iqinfo.scale));
- build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale));
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, "concatenate_width", build_opts.options());
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), width_offset, output->info());
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
- ICLKernel::configure_internal(std::get<1>(win_config));
-
- // Set output valid region
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-}
-
-void CLWidthConcatenateLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, window);
- add_4D_tensor_argument(idx, _output, window);
- enqueue(queue, *this, window, lws_hint());
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
deleted file mode 100644
index 6ced0a1778..0000000000
--- a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-
- const Size2D kernel_size = winograd_info.kernel_size;
- const Size2D output_tile_size = winograd_info.output_tile_size;
-
- const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd filter transform not supported");
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width || input->dimension(idx_h) != kernel_size.height);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input, winograd_info));
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- const unsigned int num_elems_processed_per_iteration_x = input->data_layout() == DataLayout::NCHW ? input->dimension(0) : 1;
- const unsigned int num_elems_processed_per_iteration_y = input->dimension(1);
- const unsigned int num_elems_read_per_iteration_z = input->data_layout() == DataLayout::NCHW ? 1 : input->dimension(2);
-
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y, num_elems_read_per_iteration_z));
- bool window_changed = false;
-
- AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
- AccessWindowStatic output_access(output, 0, 0, output->dimension(0), output->dimension(1));
- window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
-
- Window win_collapsed = win.collapse(win, Window::DimZ);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win_collapsed);
-}
-} // namespace
-
-CLWinogradFilterTransformKernel::CLWinogradFilterTransformKernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLWinogradFilterTransformKernel::configure(const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, winograd_info);
-}
-
-void CLWinogradFilterTransformKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input->info(), winograd_info)));
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), winograd_info));
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL");
- build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_FILTER_TRANSFORM_VERTICAL");
- const Size2D kernel_size = winograd_info.kernel_size;
- const Size2D output_tile_size = winograd_info.output_tile_size;
-
- // Create kernel
- std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(input->info()->data_layout()));
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- _input = input;
- _output = output;
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-}
-
-Status CLWinogradFilterTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, winograd_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
-
- return Status{};
-}
-
-void CLWinogradFilterTransformKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- // Setup output window
- Window window_out;
- window_out.use_tensor_dimensions(_output->info()->tensor_shape(), 0);
-
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, window);
- add_3D_tensor_argument(idx, _output, window_out);
- enqueue(queue, *this, window, lws_hint());
-}
diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
deleted file mode 100644
index 09154536ef..0000000000
--- a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-
- const PadStrideInfo conv_info = winograd_info.convolution_info;
- const Size2D output_tile_size = winograd_info.output_tile_size;
- const Size2D kernel_size = winograd_info.kernel_size;
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd input transform not supported");
-
- ARM_COMPUTE_UNUSED(conv_info);
- ARM_COMPUTE_UNUSED(output_tile_size);
- ARM_COMPUTE_UNUSED(kernel_size);
-
- // Validate configured output
- if(output->total_size() != 0)
- {
- const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_UNUSED(output);
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- bool window_changed = false;
- Window win = calculate_max_window(*input, Steps(1, 1));
-
- if(input->data_layout() == DataLayout::NCHW)
- {
- const PadStrideInfo conv_info = winograd_info.convolution_info;
- const Size2D output_tile_size = winograd_info.output_tile_size;
- const Size2D kernel_size = winograd_info.kernel_size;
-
- unsigned int num_elems_read_per_iteration_x = output_tile_size.width + kernel_size.width - 1;
- unsigned int num_elems_read_per_iteration_y = output_tile_size.height + kernel_size.height - 1;
-
- AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), num_elems_read_per_iteration_x, num_elems_read_per_iteration_y);
- window_changed = update_window_and_padding(win, input_access);
- }
- else
- {
- AccessWindowStatic input_access(input, 0, -1, input->dimension(0), input->dimension(1) + 1);
- window_changed = update_window_and_padding(win, input_access);
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLWinogradInputTransformKernel::CLWinogradInputTransformKernel()
- : _border_size(0), _input(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _num_tiles_x(0), _num_tiles_y(0), _step_z(1)
-{
-}
-
-BorderSize CLWinogradInputTransformKernel::border_size() const
-{
- return _border_size;
-}
-
-void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, winograd_info);
-}
-
-void CLWinogradInputTransformKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), winograd_info));
-
- const PadStrideInfo conv_info = winograd_info.convolution_info;
- const Size2D output_tile_size = winograd_info.output_tile_size;
- const Size2D kernel_size = winograd_info.kernel_size;
-
- _data_layout = input->info()->data_layout();
-
- const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-
- // Compute number of elements to process in the X and Y direction
- const int num_elements_x = input->info()->dimension(idx_w) - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right();
- const int num_elements_y = input->info()->dimension(idx_h) - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom();
-
- if(_data_layout == DataLayout::NCHW)
- {
- // Check if we need to extend the right or bottom border
- const unsigned int extra_border_right = ((num_elements_x % output_tile_size.width) == 0) ? 0u : static_cast<unsigned int>(output_tile_size.width - 1);
- const unsigned int extra_border_bottom = ((num_elements_y % output_tile_size.height) == 0) ? 0u : static_cast<unsigned int>(output_tile_size.height - 1);
-
- _border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right() + extra_border_right, conv_info.pad_bottom() + extra_border_bottom, conv_info.pad_left());
- }
- else
- {
- _border_size = BorderSize(1U, 0U, 1U, 0);
- }
-
- // Compute the number of output tiles along the x and y direction of size "output_tile_size"
- const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(input->info()->dimension(idx_w), input->info()->dimension(idx_h)),
- kernel_size,
- output_tile_size,
- conv_info);
-
- _input = input;
- _output = output;
- _num_tiles_x = num_tiles.width;
- _num_tiles_y = num_tiles.height;
-
- const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input->info(), winograd_info);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-
- ARM_COMPUTE_ERROR_ON(_num_tiles_x * _num_tiles_y != static_cast<int>(output->info()->dimension(1)));
- const size_t total_batches = input->info()->tensor_shape().total_size_upper(3);
-
- CLBuildOptions build_opts;
- build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x));
- build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
- build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
- build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
- build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL");
- build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL");
- if(_data_layout == DataLayout::NHWC)
- {
- build_opts.add_option_if(total_batches > 1, "-DNUM_TILES_Y=" + support::cpp11::to_string(_num_tiles_y));
- build_opts.add_option("-DSRC_DIM_1=" + support::cpp11::to_string(_input->info()->dimension(1)));
- build_opts.add_option("-DSRC_DIM_2=" + support::cpp11::to_string(_input->info()->dimension(2)));
- }
- else
- {
- build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(_input->info()->dimension(2)));
- }
-
- // Create kernel
- std::string kernel_name = "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string();
-
- // Get the maximum dimension from the tile size
- const unsigned int tile_max_dim = std::max(output_tile_size.width, output_tile_size.height);
-
- // Check optimized kernel if output_dims == 2x2
- if((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW))
- {
- _step_z = (_input->info()->dimension(2) % 2) != 0 ? 1 : 2;
- }
-
- // Append stepz and data layout
- kernel_name += "_stepz";
- kernel_name += support::cpp11::to_string(_step_z);
- kernel_name += "_" + lower_string(string_from_data_layout(_data_layout));
-
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Create window and update padding
- auto win_config = validate_and_configure_window(input->info(), output->info(), winograd_info);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second, cl::NDRange(1, 1, 8));
-
- _config_id = kernel_name;
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(conv_info.pad_left());
- _config_id += "_";
- _config_id += support::cpp11::to_string(conv_info.pad_top());
- _config_id += "_";
- _config_id += lower_string(string_from_data_layout(_data_layout));
-}
-
-Status CLWinogradInputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, winograd_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), winograd_info).first);
-
- return Status{};
-}
-
-void CLWinogradInputTransformKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
- const size_t total_batches = window.shape().total_size_upper(3);
-
- // Collapse window
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-
- Window slice = window_collapsed.first_slice_window_3D();
- slice.set(idx_w, Window::Dimension(0, _num_tiles_x, 1));
- slice.set(idx_h, Window::Dimension(0, _num_tiles_y, 1));
- if(_data_layout == DataLayout::NHWC)
- {
- slice.set(idx_h, Window::Dimension(0, _num_tiles_y * total_batches, 1));
- }
-
- ARM_COMPUTE_ERROR_ON(((slice[idx_c].end() - slice[idx_c].start()) % _step_z) != 0);
- slice.set(idx_c, Window::Dimension(slice[idx_c].start(), slice[idx_c].end(), _step_z));
-
- unsigned int idx = 2 * num_arguments_per_3D_tensor();
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[3]));
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
-
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
-}
diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
deleted file mode 100644
index 96383ff11d..0000000000
--- a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
+++ /dev/null
@@ -1,268 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
-#include "support/StringSupport.h"
-
-#include <cmath>
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_UNUSED(act_info);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-
- ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != winograd_info.output_data_layout);
-
- const PadStrideInfo conv_info = winograd_info.convolution_info;
- const Size2D output_tile_size = winograd_info.output_tile_size;
- const Size2D kernel_size = winograd_info.kernel_size;
- const Size2D input_dimensions = winograd_info.input_dimensions;
- const unsigned int num_channels = (winograd_info.kernel_size.width + winograd_info.output_tile_size.width - 1) * (winograd_info.kernel_size.height + winograd_info.output_tile_size.height - 1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, winograd_info.output_data_layout), "Winograd output transform not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != num_channels, "Wrong number of channels");
-
- // Compute number of elements to process in the X and Y direction
- // Compute the number of output tiles along the x and y direction of size "output_tile_size"
- const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions,
- kernel_size,
- output_tile_size,
- conv_info);
-
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast<unsigned int>((num_tiles.area())));
-
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
- }
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info));
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, const Size2D &output_tile_size)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- constexpr unsigned int num_elems_processed_per_iteration = 1;
-
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- bool window_changed = false;
-
- int output_static_window_end_x = 0;
- int output_static_window_end_y = 0;
-
- if(output->data_layout() == DataLayout::NCHW)
- {
- output_static_window_end_x = ceil_to_multiple(output->dimension(0), output_tile_size.width);
- output_static_window_end_y = ceil_to_multiple(output->dimension(1), output_tile_size.height);
- }
- else
- {
- output_static_window_end_x = output->dimension(0);
- output_static_window_end_y = std::max(ceil_to_multiple(output->dimension(1), output_tile_size.width), output->dimension(1) + 1 /* For out of bound reads towards the z axis */);
- }
-
- AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
- AccessWindowStatic output_access(output, 0, 0, output_static_window_end_x, output_static_window_end_y);
- window_changed = update_window_and_padding(win, input_access, output_access);
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
-
- if(bias != nullptr)
- {
- AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
- window_changed = window_changed || update_window_and_padding(win, bias_access);
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLWinogradOutputTransformKernel::CLWinogradOutputTransformKernel()
- : _input(nullptr), _bias(nullptr), _output(nullptr), _is_nhwc(false)
-{
-}
-
-void CLWinogradOutputTransformKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, winograd_info, act_info);
-}
-
-void CLWinogradOutputTransformKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const WinogradInfo &winograd_info,
- const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input->info(), winograd_info)));
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), winograd_info, act_info));
-
- _input = input;
- _bias = bias;
- _output = output;
- _is_nhwc = winograd_info.output_data_layout == DataLayout::NHWC;
-
- // Compute num_tiles_x
- const Size2D input_dimensions = winograd_info.input_dimensions;
- const Size2D kernel_size = winograd_info.kernel_size;
- const Size2D output_tile_size = winograd_info.output_tile_size;
- const PadStrideInfo conv_info = winograd_info.convolution_info;
-
- // Compute the number of output tiles along the x and y direction of size "output_tile_size"
- const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions,
- kernel_size,
- output_tile_size,
- conv_info);
- const size_t total_batches = output->info()->tensor_shape().total_size_upper(3);
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
- build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
- build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
-
- if((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2))
- {
- build_opts.add_option("-DVEC_SIZE=2");
- }
- else if((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4))
- {
- build_opts.add_option("-DVEC_SIZE=4");
- }
-
- build_opts.add_option_if(_bias != nullptr, std::string("-DHAS_BIAS"));
- build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(num_tiles.width));
- build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
- build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(_input->info()->dimension(2)));
- build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL");
- build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL");
-
- // Create kernel
- std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(winograd_info.output_data_layout));
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), winograd_info.output_tile_size);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += lower_string(string_from_data_layout(winograd_info.output_data_layout));
-}
-
-Status CLWinogradOutputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, (bias != nullptr ? bias->clone().get() : nullptr), output, winograd_info, act_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (bias != nullptr ? bias->clone().get() : nullptr), output->clone().get(), winograd_info.output_tile_size).first);
-
- return Status{};
-}
-
-void CLWinogradOutputTransformKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- // Collapse window
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-
- // Get initial windows
- Window slice = window_collapsed.first_slice_window_4D();
- slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
- // Setup output slice
- Window slice_out(slice);
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- if(_bias != nullptr)
- {
- unsigned int idx1 = 2 * num_arguments_per_4D_tensor();
- Window slice_biases;
- slice_biases.use_tensor_dimensions(_bias->info()->tensor_shape());
- add_1D_tensor_argument(idx1, _bias, slice_biases);
- }
-
- if(_is_nhwc)
- {
- unsigned int idx2 = 2 * num_arguments_per_4D_tensor() + ((_bias != nullptr) ? num_arguments_per_1D_tensor() : 0);
- _kernel.setArg(idx2, static_cast<int>(_output->info()->total_size() - _output->info()->strides_in_bytes().y()));
- }
-
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice);
- add_4D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLYOLOLayerKernel.cpp b/src/core/CL/kernels/CLYOLOLayerKernel.cpp
deleted file mode 100644
index 3a9f822eae..0000000000
--- a/src/core/CL/kernels/CLYOLOLayerKernel.cpp
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLYOLOLayerKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/Types.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC);
-
- const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(num_classes <= 0);
- ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(channel_idx) % (num_classes + 5)) != 0);
-
- // Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- if(output != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, *input);
- }
-
- const bool is_nchw = input->data_layout() == DataLayout::NCHW;
- const unsigned int num_elems_processed_per_iteration = is_nchw ? 16 / input->element_size() : 1;
-
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- bool window_changed = false;
-
- if(output != nullptr)
- {
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, input->valid_region());
- }
- else
- {
- window_changed = update_window_and_padding(win, AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration));
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLYOLOLayerKernel::CLYOLOLayerKernel()
- : _input(nullptr), _output(nullptr), _run_in_place(false)
-{
-}
-
-void CLYOLOLayerKernel::configure(ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, act_info, num_classes);
-}
-
-void CLYOLOLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input);
-
- _run_in_place = (output == nullptr) || (output == input);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, act_info, num_classes));
-
- const bool is_nchw = input->info()->data_layout() == DataLayout::NCHW;
- const unsigned int num_elems_processed_per_iteration = is_nchw ? 16 / input->info()->element_size() : 1;
- const DataType dt = input->info()->data_type();
- float a_const = act_info.a();
- float b_const = act_info.b();
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt));
- build_opts.add_option("-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(dt));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(a_const));
- build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(b_const));
- build_opts.add_option("-DNUM_CLASSES=" + support::cpp11::to_string(num_classes));
- build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
-
- // Create kernel
- std::string kernel_name = std::string("yolo_layer_") + lower_string(string_from_data_layout(input->info()->data_layout()));
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Make sure _kernel is initialized before calling the parent's configure
- _input = input;
- _output = output;
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Set config_id for enabling LWS tuning
- _config_id = "yolo_layer_";
- _config_id += lower_string(string_from_data_type(dt));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
-}
-
-Status CLYOLOLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes)
-{
- const bool run_in_place = (output == nullptr) || (output == input);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info, num_classes));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get()).first);
-
- return Status{};
-}
-
-void CLYOLOLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- if(!_run_in_place)
- {
- add_3D_tensor_argument(idx, _output, slice);
- }
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute